From 096673f67527b0fed1aab1843041b9d929fd0fb5 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 29 Nov 2018 13:20:29 +0000 Subject: [PATCH 01/45] refactor eager deletion test=develop --- paddle/fluid/framework/details/CMakeLists.txt | 12 +- .../details/computation_op_handle.cc | 6 +- .../framework/details/computation_op_handle.h | 6 +- .../details/eager_deletion_op_handle.cc | 117 ++++++++++ .../details/eager_deletion_op_handle.h | 64 ++++++ .../framework/details/eager_deletion_pass.cc | 96 ++++++++ .../framework/details/eager_deletion_pass.h | 32 +++ .../details/multi_devices_graph_pass.cc | 6 +- .../details/reference_count_op_handle.h | 138 ------------ .../framework/details/reference_count_pass.cc | 213 +++++------------- .../framework/details/reference_count_pass.h | 5 - .../details/reference_count_pass_helper.h | 49 ++++ .../scope_buffered_ssa_graph_executor.cc | 30 +-- .../scope_buffered_ssa_graph_executor.h | 4 + paddle/fluid/framework/garbage_collector.h | 12 +- paddle/fluid/framework/ir/graph.h | 11 +- paddle/fluid/framework/ir/pass.h | 11 +- paddle/fluid/framework/parallel_executor.cc | 106 ++++++--- paddle/fluid/framework/parallel_executor.h | 24 +- paddle/fluid/platform/CMakeLists.txt | 9 +- .../fluid/platform/stream_callback_manager.cc | 70 ++++++ .../fluid/platform/stream_callback_manager.h | 51 +---- 22 files changed, 631 insertions(+), 441 deletions(-) create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.h create mode 100644 paddle/fluid/framework/details/eager_deletion_pass.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_pass.h delete mode 100644 paddle/fluid/framework/details/reference_count_op_handle.h create mode 100644 paddle/fluid/framework/details/reference_count_pass_helper.h create mode 100644 paddle/fluid/platform/stream_callback_manager.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 93288936fe..8cf97d667d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -33,10 +33,9 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -if (WITH_GPU) - cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle - all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) -endif() +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) @@ -44,10 +43,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle 
broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass) -if (WITH_GPU) - list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) -endif() +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c60..7beb8c8de9 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -20,11 +20,13 @@ namespace paddle { namespace framework { namespace details { ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, - platform::Place place) + platform::Place place, + size_t scope_idx) : OpHandleBase(node), op_(framework::OpRegistry::CreateOp(*node->Op())), scope_(scope), - place_(place) {} + place_(place), + scope_idx_(scope_idx) {} void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 662a91d6b4..601ae4f8c6 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -28,7 +28,8 @@ namespace framework { namespace details { struct ComputationOpHandle : public OpHandleBase { public: - ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place); + ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, + size_t scope_idx); std::string Name() const override; @@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase { void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; } + size_t GetScopeIdx() const { return scope_idx_; } + protected: void RunImpl() override; @@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase { std::unique_ptr op_; Scope *scope_; platform::Place place_; + size_t scope_idx_; bool is_lock_and_record_event_free_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc new file mode 100644 index 0000000000..cd26203376 --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { +namespace details { + +EagerDeletionOpHandle::EagerDeletionOpHandle( + ir::Node *node, const Scope *scope, const platform::Place &place, + const std::vector &var_names, GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + dev_ctx_ = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + if (dynamic_cast *>(gc_)) { + platform::SetDeviceId(boost::get(place).device); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + } +#endif + + for (auto &name : var_names) AddVar(name); +} + +EagerDeletionOpHandle::~EagerDeletionOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + platform::SetDeviceId(gpu_place.device); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif +} + +std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } + +void EagerDeletionOpHandle::AddVar(const std::string &name) { + var_names_.insert(name); +} + +void EagerDeletionOpHandle::RunImpl() { + auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + std::vector tensors; + for (auto &name : var_names_) { + auto it = ref_cnts_->find(name); + if (it == ref_cnts_->end()) { + continue; + } + + auto *var = exec_scope->FindVar(name); + if (var == nullptr) { + continue; + } + + if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + tensors.emplace_back(var->GetMutable()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + tensors.emplace_back(var->GetMutable()->mutable_value()); + } + } else if (var->IsType()) { + if (it->second.fetch_sub(1) == 1) { + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + tensors.emplace_back(&t); + } + } + } + } + + if (!tensors.empty()) { + ClearTensors(tensors); + } +} + +void EagerDeletionOpHandle::ClearTensors(const std::vector &tensors) { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = + static_cast *>(gc_)->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { +#endif + gc_->Add(tensors); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h new file mode 100644 index 0000000000..8254f21bdf --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace details { + +class EagerDeletionPass; + +class EagerDeletionOpHandle : public OpHandleBase { + public: + EagerDeletionOpHandle(ir::Node *node, const Scope *scope, + const platform::Place &place, + const std::vector &var_names, + GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts); + + ~EagerDeletionOpHandle(); + + std::string Name() const override; + + protected: + void RunImpl() override; + + private: + void ClearTensors(const std::vector &tensors); + + void AddVar(const std::string &name); + + const Scope *scope_; + std::unordered_set var_names_; + GarbageCollector *gc_; // not own + AtomicReferenceCountMap *ref_cnts_; // not own +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext *dev_ctx_{nullptr}; + cudaEvent_t event_{nullptr}; +#endif + + friend class EagerDeletionPass; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc new file mode 100644 index 0000000000..f877c2881c --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, + ir::Graph *graph) { + auto it = std::find_if( + in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != in->Outputs().end()) { + out->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + in->AddOutput(dep_var); + out->AddInput(dep_var); + } + + // Add leaf node to eager_deletion_node + if (out->Outputs().empty()) { + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + out->AddOutput(dummy_leaf); + } +} + +std::unique_ptr EagerDeletionPass::ApplyImpl( + std::unique_ptr graph) const { + auto &vars = graph->Get(kGraphVars); + + auto &ref_cnts = + Get>(kCurReferenceCount); + auto &last_live_ops = Get>(kLastLiveOpsOfVars); + auto &gcs = Get(kGarbageCollector); + + ref_cnts = std::vector(vars.size()); + + std::unordered_map op_map; + for (auto &var_ops_map : last_live_ops) { + for (auto &var_ops_pair : var_ops_map) { + const std::string &var_name = var_ops_pair.first; + for (ComputationOpHandle *op : var_ops_pair.second) { + auto it = op_map.find(op); + if (it != op_map.end()) { + it->second->AddVar(var_name); + } else { + auto *eager_deletion_node = graph->CreateEmptyNode( + "eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), {var_name}, + gcs[op->GetScopeIdx()].get(), &(ref_cnts[op->GetScopeIdx()])); + AddDependencyBetween(op, eager_deletion_op, graph.get()); + op_map[op] = eager_deletion_op; + } + } + } + } + VLOG(10) << "Create " << op_map.size() << " EagerDeletionOpHandle(s)"; + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(eager_deletion_pass, + paddle::framework::details::EagerDeletionPass) + .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/details/eager_deletion_pass.h new file mode 100644 index 0000000000..d7a7a9709d --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class EagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index a36ad25926..97830386e4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -562,7 +562,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), - local_scopes_[dev_id], places_[dev_id])); + local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } @@ -685,8 +685,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get(kGraphOps).emplace_back( - new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); + result->Get(kGraphOps).emplace_back(new ComputationOpHandle( + result->CreateOpNode(node->Op()), s, p, scope_idx)); CreateOpHandleIOs(result, node, scope_idx); } } diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h deleted file mode 100644 index cc4ccfbdfc..0000000000 --- a/paddle/fluid/framework/details/reference_count_op_handle.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace framework { -namespace details { - -using ReferenceCountMap = std::unordered_map; -using AtomicReferenceCountMap = - std::unordered_map>; -using DeviceReferenceCountMap = - std::unordered_map>; -using AtomicDeviceReferenceCountMap = - std::unordered_map>; -using DeviceGarbageCollectorMap = - std::unordered_map>>; - -class ReferenceCountOpHandle : public OpHandleBase { - public: - ReferenceCountOpHandle(ir::Node *node, const Scope *scope, - const platform::CUDAPlace &place, - const std::vector &var_names, - GarbageCollector *gc, - AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { - dev_ctx_ = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - if (IsStreamGarabageCollector()) { - platform::SetDeviceId(place.device); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - for (auto &name : var_names) AddVar(name); - } - - ~ReferenceCountOpHandle() { - if (IsStreamGarabageCollector()) { - auto gpu_place = boost::get(dev_ctx_->GetPlace()); - platform::SetDeviceId(gpu_place.device); - PADDLE_ENFORCE(cudaEventDestroy(event_)); - } - } - - std::string Name() const override { return "reference_count"; } - - void AddVar(const std::string &name) { - auto it = var_names_.find(name); - if (it != var_names_.end()) - ++(it->second); - else - var_names_[name] = 1; - } - - protected: - void RunImpl() override { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); - std::vector tensors; - for (auto &pair : var_names_) { - auto &name = pair.first; - auto it = ref_cnts_->find(name); - if (it == ref_cnts_->end()) continue; - - auto *var = exec_scope->FindVar(name); - if (var == nullptr) continue; - - if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back(var->GetMutable()); - } - } else if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back( - var->GetMutable()->mutable_value()); - } - } - } - - if (!tensors.empty()) { - ClearTensors(tensors); - } - } - - private: - void ClearTensors(const std::vector &tensors) { - auto *gc = dynamic_cast *>(gc_); - if (gc != nullptr) { - auto compute_stream = dev_ctx_->stream(); - auto callback_stream = gc->stream(); - auto callback_func = [=]() { - PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); - }; - gc_->Add(tensors, callback_func); - } else { - gc_->Add(tensors); - } - } - - bool IsStreamGarabageCollector() const { - return dynamic_cast *>(gc_) != nullptr; - } - - const Scope *scope_; - platform::CUDADeviceContext *dev_ctx_; - std::unordered_map var_names_; - GarbageCollector *gc_; // not own - AtomicReferenceCountMap *ref_cnts_; // not own - cudaEvent_t event_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 08783fb5f8..f094c7afa9 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -17,184 +17,96 @@ #include 
#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/reference_count_pass.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { namespace details { -static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) { - std::queue queue; - queue.push(var_in); +static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( + OpHandleBase *op, size_t scope_idx) { + std::queue q; + std::unordered_set visited; + q.push(op); do { - auto *var = queue.front(); - queue.pop(); - for (auto *op : var->PendingOps()) { - auto *compute_op = dynamic_cast(op); - if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { - return compute_op; - } - for (auto *out_var : op->Outputs()) { - queue.push(out_var); + auto *op = q.front(); + q.pop(); + auto *compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) { + return compute_op; + } + for (auto *out_var : op->Outputs()) { + for (auto *pending_op : out_var->PendingOps()) { + if (visited.count(pending_op)) continue; + visited.insert(pending_op); } } - } while (!queue.empty()); + } while (!q.empty()); return nullptr; } -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); - } -} - std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &ref_cnts = Get(kGlobalReferenceCount); - auto &cur_ref_cnts = Get(kCurReferenceCount); - auto &gcs = Get(kGarbageCollector); - - // It is not easy to find the right reference counts of varaibles in graph - // Step 1: Find all variables in computation ops - // Step 2: Find all variables in non-computation ops which refers to variables - // in computation ops - std::unordered_set names; - std::unordered_map - compute_ref_cnt_map; - - auto get_ref_cnts_from_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - std::vector var_names_in_op; - auto *compute_op = dynamic_cast(op); - if (compute_op == nullptr || - !platform::is_gpu_place(compute_op->GetPlace())) - return var_names_in_op; - auto place = boost::get(compute_op->GetPlace()); - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - if (!platform::is_gpu_place(var_handle->place_) || - boost::get(var_handle->place_) != place) + auto &vars = graph->Get(kGraphVars); + auto &ref_cnts = Get>(kGlobalReferenceCount); + auto &last_live_ops_of_vars = + Get>(kLastLiveOpsOfVars); + + last_live_ops_of_vars = std::vector(vars.size()); + ref_cnts = std::vector(vars.size()); + + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + if (name_var_pair.second.empty()) continue; + auto *last_ver_var = name_var_pair.second.back(); + + VarDesc *var_desc = nullptr; + std::find_if(name_var_pair.second.rbegin(), 
name_var_pair.second.rend(), + [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + + if (var_desc == nullptr || var_desc->Persistable()) { continue; - - VarDesc *var_desc = var_handle->Node()->Var(); - auto var_name = var_handle->Node()->Name(); - - // This is weird but there is really some variables without var_desc - // in computation_op - if (var_desc == nullptr) { - var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name); - if (var_desc == nullptr) continue; } - if (var_desc->Persistable()) continue; auto var_type = var_desc->Proto()->type().type(); if (var_type != proto::VarType::LOD_TENSOR && - var_type != proto::VarType::SELECTED_ROWS) { + var_type != proto::VarType::SELECTED_ROWS && + var_type != proto::VarType::LOD_TENSOR_ARRAY) { continue; } - // compute op only runs in one device - if (ref_cnts[place.device]->count(var_name)) - ++(*ref_cnts[place.device])[var_name]; - else - (*ref_cnts[place.device])[var_name] = 1; - - names.insert(var_name); - var_names_in_op.push_back(var_name); - } - return var_names_in_op; - }; - - auto update_ref_cnts_from_non_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - if (dynamic_cast(op) != nullptr) return; - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - auto var_name = var_handle->Node()->Name(); - auto var_place = var_handle->place_; - if (!platform::is_gpu_place(var_place)) continue; - auto place = boost::get(var_place); - if (names.count(var_name) == 0) continue; - if (ref_cnts.count(place.device) && - ref_cnts[place.device]->count(var_name)) { - ++(*ref_cnts[place.device])[var_name]; - - auto *next_compute_op = FindNextComputationOpHandle(var_handle); - if (next_compute_op != nullptr) { - if (compute_ref_cnt_map.count(next_compute_op)) { - compute_ref_cnt_map[next_compute_op]->AddVar(var_name); - VLOG(5) << "Add reference count of " << var_name << " to Operator " - << next_compute_op->Name(); - } else { - // Create new reference_count_op_handle - ir::Node *ref_cnt_node = graph->CreateEmptyNode( - "reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, next_compute_op->GetScope(), place, {var_name}, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[next_compute_op] = ref_cnt_handle; - } + std::unordered_set last_live_op; + auto add_last_live_op = [&](OpHandleBase *op) { + auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); + if (compute_op) { + last_live_op.insert(compute_op); + } + }; + const std::string &var_name = name_var_pair.first; + auto &pending_ops = last_ver_var->PendingOps(); + if (pending_ops.empty()) { + auto *generated_op = last_ver_var->GeneratedOp(); + if (generated_op) { + ref_cnts[i].emplace(var_name, 1); + add_last_live_op(generated_op); + } + } else { + ref_cnts[i].emplace(var_name, pending_ops.size()); + for (auto *pending_op : pending_ops) { + add_last_live_op(pending_op); } } - } - }; - auto all_ops = ir::FilterByNodeWrapper(*graph); - for (auto &op : all_ops) { - auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs()); - auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs()); - if (in_var_names.empty() && out_var_names.empty()) continue; - in_var_names.insert(in_var_names.end(), out_var_names.begin(), - 
out_var_names.end()); - auto *compute_op = dynamic_cast(op); - auto place = boost::get(compute_op->GetPlace()); - ir::Node *ref_cnt_node = - graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, compute_op->GetScope(), place, in_var_names, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[compute_op] = ref_cnt_handle; - } - - for (auto &op : all_ops) { - update_ref_cnts_from_non_compute_op(op, op->Inputs()); - update_ref_cnts_from_non_compute_op(op, op->Outputs()); - } - - std::vector new_all_ops; - new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size()); - for (auto &op : all_ops) { - new_all_ops.emplace_back(std::move(op)); - auto it = compute_ref_cnt_map.find(new_all_ops.back()); - if (it != compute_ref_cnt_map.end()) { - // Add LeafNode to ReferenceCountOpHandle - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - it->second->AddOutput(dummy_leaf); - new_all_ops.emplace_back(std::move(it->second)); + last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); } } - - all_ops.swap(new_all_ops); return graph; } @@ -205,5 +117,4 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( REGISTER_PASS(reference_count_pass, paddle::framework::details::ReferenceCountPass) .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) - .RequirePassAttr(paddle::framework::details::kGarbageCollector); + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars); diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h index 7081280b06..bcbef02735 100644 --- a/paddle/fluid/framework/details/reference_count_pass.h +++ b/paddle/fluid/framework/details/reference_count_pass.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/framework/details/reference_count_op_handle.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -22,10 +21,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kGlobalReferenceCount[] = "reference_count"; -constexpr char kCurReferenceCount[] = "current_reference_count"; -constexpr char kGarbageCollector[] = "garbage_collector"; - class ReferenceCountPass : public ir::Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h new file mode 100644 index 0000000000..77846f7bdf --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ComputationOpHandle;
+
+using ReferenceCountMap = std::unordered_map<std::string, size_t>;
+
+using AtomicReferenceCountMap =
+    std::unordered_map<std::string, std::atomic<size_t>>;
+
+using GarbageCollectorList =
+    std::vector<std::unique_ptr<GarbageCollector<Tensor>>>;
+
+const char kGlobalReferenceCount[] = "reference_count";
+const char kCurReferenceCount[] = "current_reference_count";
+const char kGarbageCollector[] = "garbage_collector";
+
+using LastLiveOpsOfVars =
+    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle *>>;
+const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index e5b1eaa731..f1bf6542a3 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -18,9 +18,6 @@
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/reference_count_op_handle.h"
-#endif
 
 namespace paddle {
 namespace framework {
 namespace details {
@@ -33,7 +30,11 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
       underlying_executor_(std::move(underlying_executor)),
       local_scopes_(std::move(local_scopes)),
       var_infos_(std::move(var_infos)),
-      places_(std::move(places)) {}
+      places_(std::move(places)) {
+  if (Graph().Has(details::kGarbageCollector)) {
+    gc_ = &(Graph().Get<details::GarbageCollectorList>(
+        details::kGarbageCollector));
+  }
+}
 
 FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
@@ -69,27 +70,16 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
   platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
 
-#ifdef PADDLE_WITH_CUDA
-  const std::string gc_name = "garbage_collector";
-  DeviceGarbageCollectorMap *gc =
-      Graph().Has(gc_name) ?
&(Graph().Get(gc_name)) - : nullptr; -#endif - if (!fetch_tensors.empty() || drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (auto p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); -#ifdef PADDLE_WITH_CUDA - if (gc != nullptr && platform::is_gpu_place(p)) { - auto gpu_place = boost::get(p); - auto &gc_at_place = gc->at(gpu_place.device); - gc_at_place->Wait(); - gc_at_place->Reset(); + for (size_t i = 0; i < places_.size(); ++i) { + platform::DeviceContextPool::Instance().Get(places_[i])->Wait(); + if (gc_) { + (*gc_)[i]->Wait(); + (*gc_)[i]->Reset(); } -#endif } for (auto &scope : local_scopes_) { auto &local_scope = diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 5e87e0bf50..ce3061d6e6 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -21,9 +21,11 @@ #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" + namespace paddle { namespace framework { namespace details { @@ -55,6 +57,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector var_infos_; std::vector places_; + + GarbageCollectorList* gc_{nullptr}; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 818b3334ea..cbe8f606ef 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -65,7 +65,7 @@ class GarbageCollector { if (clear_deque != nullptr) { callback(); - ClearCallback([=]() { + ClearCallback([clear_deque]() { for (auto *obj : *clear_deque) obj->clear(); }); } @@ -109,7 +109,6 @@ class DefaultStreamGarbageCollector : public GarbageCollector { } void Wait() const override { - this->dev_ctx_->Wait(); static_cast(this->dev_ctx_) ->WaitStreamCallback(); } @@ -127,14 +126,14 @@ class StreamGarbageCollector : public GarbageCollector { StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - PADDLE_ENFORCE(cudaSetDevice(place.device)); + platform::SetDeviceId(place.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } ~StreamGarbageCollector() { auto place = boost::get(this->dev_ctx_->GetPlace()); - PADDLE_ENFORCE(cudaSetDevice(place.device)); + platform::SetDeviceId(place.device); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); } @@ -148,8 +147,11 @@ class StreamGarbageCollector : public GarbageCollector { cudaStream_t stream() const { return stream_; } protected: + // ClearCallback and Wait()/Reset() cannot be call in multiple threads + // But it is not important, because they would not be called in multiple + // threads + // either in Executor or ParallelExecutor void ClearCallback(const std::function &callback) override { - std::lock_guard guard(this->mutex_); callback_manager_->AddCallback(callback); } diff --git a/paddle/fluid/framework/ir/graph.h 
b/paddle/fluid/framework/ir/graph.h index 947c934f0f..7a2560c14d 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -73,14 +73,21 @@ class Graph { } bool Has(const std::string &attr_name) const { - return attrs_.find(attr_name) != attrs_.end(); + return attrs_.count(attr_name) > 0; } template AttrType &Get(const std::string &attr_name) const { PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.", attr_name); - return *boost::any_cast(attrs_.at(attr_name)); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast &) { + PADDLE_THROW( + "Invalid attribute type of %s error, expected: %s, actual: %s", + attr_name, typeid(AttrType *).name(), + attrs_.at(attr_name).type().name()); + } } template diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index a3559247db..27746ff145 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -51,11 +51,18 @@ class Pass { AttrType &Get(const std::string &attr_name) const { PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), "%s attr not registered for pass.", attr_name); - return *boost::any_cast(attrs_.at(attr_name)); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast &) { + PADDLE_THROW( + "Invalid attribute type of %s error, expected: %s, actual: %s", + attr_name, typeid(AttrType *).name(), + attrs_.at(attr_name).type().name()); + } } bool Has(const std::string &attr_name) const { - return attrs_.find(attr_name) != attrs_.end(); + return attrs_.count(attr_name) > 0; } void Erase(const std::string &attr_name) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b98408ee77..e71f93beef 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -49,6 +50,15 @@ class ParallelExecutorPrivate { } } } + + void ResetRuntimeReferenceCount() { + for (size_t i = 0; i < rt_ref_cnts_.size(); ++i) { + for (auto &pair : rt_ref_cnts_[i]) { + rt_cur_ref_cnts_[i][pair.first] = pair.second; + } + } + } + std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -60,6 +70,13 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; + + // rt_ref_cnts_ is only initialized when ParallelExecutor constructs, and then + // keeps unchanged + // Before each iteration, rt_cur_ref_cnts_ is reset to ref_cnts_ + std::vector rt_ref_cnts_; + std::vector rt_cur_ref_cnts_; + details::GarbageCollectorList gcs_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -128,35 +145,56 @@ ParallelExecutor::ParallelExecutor( std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); +#else + std::unique_ptr graph = + build_strategy.Apply(main_program, member_->places_, loss_var_name, + params, member_->local_scopes_, member_->use_cuda_); +#endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - for (auto &place : member_->places_) { - if (!platform::is_gpu_place(place)) continue; - auto gpu_place = boost::get(place); - if (gcs_[gpu_place.device] == nullptr) { - ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap()); - cur_ref_cnts_[gpu_place.device].reset( - new details::AtomicReferenceCountMap()); - gcs_[gpu_place.device].reset( - new StreamGarbageCollector(gpu_place, max_memory_size)); + size_t place_num = member_->places_.size(); + for (size_t i = 0; i < place_num; ++i) { + auto &place = member_->places_[i]; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + member_->gcs_.emplace_back(new StreamGarbageCollector( + boost::get(place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else if (platform::is_cpu_place(place)) { +#endif + member_->gcs_.emplace_back(new CPUGarbageCollector( + boost::get(place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#ifdef PADDLE_WITH_CUDA } - } - if (!gcs_.empty()) { - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = ref_cnt_pass->Apply(std::move(graph)); - graph->SetNotOwned("garbage_collector", &gcs_); +#endif } } -#else - std::unique_ptr graph = - build_strategy.Apply(main_program, member_->places_, loss_var_name, - params, member_->local_scopes_, member_->use_cuda_); -#endif + + if (!member_->gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &(member_->rt_ref_cnts_)); + 
ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + VLOG(10) << "ReferenceCountPass Applied"; + graph = ref_cnt_pass->Apply(std::move(graph)); + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kCurReferenceCount, + &(member_->rt_cur_ref_cnts_)); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, + &(member_->gcs_)); + eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars @@ -271,18 +309,16 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); -#ifdef PADDLE_WITH_CUDA - if (!gcs_.empty()) { - ResetReferenceCount(); - for (auto &pair : cur_ref_cnts_) { - auto &name_map = *(pair.second); + if (!member_->gcs_.empty()) { + member_->ResetRuntimeReferenceCount(); + size_t n = member_->rt_ref_cnts_.size(); + for (size_t i = 0; i < n; ++i) { for (auto &fetch_name : fetch_tensors) { - name_map.erase(fetch_name); + member_->rt_cur_ref_cnts_[i].erase(fetch_name); } - name_map.erase(fetched_var_name); + member_->rt_cur_ref_cnts_[i].erase(fetched_var_name); } } -#endif auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; @@ -326,13 +362,11 @@ ParallelExecutor::~ParallelExecutor() { for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - // member_ must be destructed before gcs_ since the destructor of - // ReferenceCountOpHandle use raw pointers of gcs_ inside. - member_.reset(); + delete member_; } } // namespace framework } // namespace paddle -#ifdef PADDLE_WITH_CUDA + USE_PASS(reference_count_pass); -#endif +USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ef09b98b2a..1fc17a0d64 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include #include #include @@ -29,10 +28,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/reference_count_pass.h" -#endif - namespace paddle { namespace framework { @@ -75,24 +70,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - std::unique_ptr member_; - -#ifdef PADDLE_WITH_CUDA - // ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_ - details::DeviceReferenceCountMap ref_cnts_; - details::AtomicDeviceReferenceCountMap cur_ref_cnts_; - details::DeviceGarbageCollectorMap gcs_; - - void ResetReferenceCount() { - for (auto &pair1 : ref_cnts_) { - for (auto &pair2 : *(pair1.second)) { - (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second; - } - } - } -#endif + ParallelExecutorPrivate *member_; }; } // namespace framework diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 93cb5eb2dc..23c7ebe842 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -56,9 +56,16 @@ ELSE() set(MKLDNN_CTX_DEPS) ENDIF() +nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +IF(WITH_GPU) + set(STREAM_CALLBACK_DEPS stream_callback_manager) +ELSE() + set(STREAM_CALLBACK_DEPS) +ENDIF() + # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc +cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc new file mode 100644 index 0000000000..ae915365f8 --- /dev/null +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/stream_callback_manager.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +struct StreamCallbackContext { + inline StreamCallbackContext(const StreamCallbackManager *manager, + std::function callback) + : manager_(manager), callback_(std::move(callback)) {} + + const StreamCallbackManager *manager_; // do not own + std::function callback_; +}; + +StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream) + : stream_(stream), thread_pool_(new ::ThreadPool(1)) {} + +void StreamCallbackManager::AddCallback(std::function callback) const { + auto *stream_callback_context = + new StreamCallbackContext(this, std::move(callback)); +#if CUDA_VERSION >= 10000 + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, + StreamCallbackManager::StreamCallbackFunc, + stream_callback_context)); +#else + PADDLE_ENFORCE( + cudaStreamAddCallback(stream_, StreamCallbackManager::StreamCallbackFunc, + stream_callback_context, 0)); +#endif +} + +void StreamCallbackManager::Wait() const { + thread_pool_.reset(new ::ThreadPool(1)); +} + +#if CUDA_VERSION >= 10000 +void CUDART_CB StreamCallbackManager::StreamCallbackFunc(void *user_data) +#else +void CUDART_CB StreamCallbackManager::StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, + void *user_data) +#endif +{ + auto *callback_context_ptr = + reinterpret_cast(user_data); + callback_context_ptr->manager_->thread_pool_->enqueue( + [callback_context_ptr]() { + std::unique_ptr callback_context( + callback_context_ptr); + callback_context->callback_(); + }); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index ed8734c98c..eac4806d13 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -19,66 +19,29 @@ #include #include #include -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -class StreamCallbackManager; - -struct StreamCallbackContext { - template - inline StreamCallbackContext(const StreamCallbackManager *manager, - Callback &&callback) - : manager_(manager), callback_(callback) {} - - const StreamCallbackManager *manager_; // do not own - std::function callback_; -}; - +// NOTE(zjl): clean StreamCallback to make compilation faster class StreamCallbackManager { public: - explicit inline StreamCallbackManager(cudaStream_t stream = nullptr) - : stream_(stream), thread_pool_(new ThreadPool(1)) {} + explicit StreamCallbackManager(const cudaStream_t stream); - template - inline void AddCallback(Callback &&callback) const { - auto *stream_callback_context = - new StreamCallbackContext(this, std::forward(callback)); -#if CUDA_VERSION >= 10000 - PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context)); // NOLINT -#else - PADDLE_ENFORCE(cudaStreamAddCallback( - stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0)); // NOLINT -#endif - } + void AddCallback(std::function callback) const; - void Wait() const { thread_pool_.reset(new ThreadPool(1)); } + void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr thread_pool_; + mutable std::unique_ptr<::ThreadPool> thread_pool_; -// cudaStreamCallback cannot call CUDA API inside, so we have to use -// thread_pool here #if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data) + static void 
CUDART_CB StreamCallbackFunc(void *user_data); #else static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data) + cudaError_t status, void *user_data); #endif - { - auto *callback_context_ptr = - reinterpret_cast(user_data); - callback_context_ptr->manager_->thread_pool_->enqueue([=]() { - std::unique_ptr callback_context( - callback_context_ptr); - callback_context->callback_(); - }); - } }; } // namespace platform From c47c451a007f33078bfb8f38be4a6cd50922f361 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 11:45:53 +0000 Subject: [PATCH 02/45] fix bug --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/computation_op_handle.cc | 2 + .../details/eager_deletion_op_handle.cc | 23 ++-- .../details/eager_deletion_op_handle.h | 8 +- .../framework/details/eager_deletion_pass.cc | 81 ++++++------ .../fluid/framework/details/op_graph_view.h | 29 +++- .../framework/details/reference_count_pass.cc | 125 ++++++++++++++++-- .../scope_buffered_ssa_graph_executor.cc | 21 ++- .../scope_buffered_ssa_graph_executor.h | 2 + paddle/fluid/framework/executor.cc | 104 +++++++++++---- paddle/fluid/framework/executor.h | 51 ++----- paddle/fluid/framework/garbage_collector.h | 44 +++--- paddle/fluid/framework/operator.cc | 2 + paddle/fluid/framework/parallel_executor.cc | 13 +- paddle/fluid/framework/scope.cc | 6 + paddle/fluid/framework/scope.h | 1 + paddle/fluid/framework/tensor.h | 2 +- .../fluid/operators/controlflow/while_op.cc | 44 +++++- paddle/fluid/operators/reader/ctr_reader.h | 12 +- paddle/fluid/platform/device_context.h | 10 +- .../fluid/platform/stream_callback_manager.cc | 67 +++++----- .../fluid/platform/stream_callback_manager.h | 20 +-- paddle/fluid/pybind/tensor_py.h | 12 +- python/paddle/fluid/__init__.py | 5 +- 24 files changed, 458 insertions(+), 228 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8cf97d667d..8049f5d3f7 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -35,7 +35,7 @@ cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_e cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) -cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7beb8c8de9..2bf43fd4e0 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -31,6 +31,8 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); + VLOG(10) << "Run Op" << Name(); + auto run_func = [this]() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc 
b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index cd26203376..41f616035d 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
 
 namespace paddle {
 namespace framework {
@@ -23,28 +24,32 @@ namespace details {
 
 EagerDeletionOpHandle::EagerDeletionOpHandle(
     ir::Node *node, const Scope *scope, const platform::Place &place,
-    const std::vector<std::string> &var_names, GarbageCollector<Tensor> *gc,
-    AtomicReferenceCountMap *ref_cnts)
-    : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) {
+    const std::unordered_set<std::string> &var_names,
+    GarbageCollector<Tensor> *gc, AtomicReferenceCountMap *ref_cnts)
+    : OpHandleBase(node),
+      scope_(scope),
+      var_names_(var_names),
+      gc_(gc),
+      ref_cnts_(ref_cnts) {
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(place)) {
     dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     if (dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_)) {
-      platform::SetDeviceId(boost::get<platform::CUDAPlace>(place).device);
+      platform::CUDADeviceGuard guard(
+          boost::get<platform::CUDAPlace>(place).device);
       PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+      PADDLE_ENFORCE_NOT_NULL(event_);
     }
   }
 #endif
-
-  for (auto &name : var_names) AddVar(name);
 }
 
 EagerDeletionOpHandle::~EagerDeletionOpHandle() {
 #ifdef PADDLE_WITH_CUDA
   if (event_) {
     auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
-    platform::SetDeviceId(gpu_place.device);
+    platform::CUDADeviceGuard guard(gpu_place.device);
     PADDLE_ENFORCE(cudaEventDestroy(event_));
   }
 #endif
@@ -52,10 +57,6 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
 
 std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
 
-void EagerDeletionOpHandle::AddVar(const std::string &name) {
-  var_names_.insert(name);
-}
-
 void EagerDeletionOpHandle::RunImpl() {
   auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
   std::vector<Tensor *> tensors;
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
index 8254f21bdf..d8de59cc4d 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -25,13 +25,11 @@ class Scope;
 
 namespace details {
 
-class EagerDeletionPass;
-
 class EagerDeletionOpHandle : public OpHandleBase {
  public:
   EagerDeletionOpHandle(ir::Node *node, const Scope *scope,
                         const platform::Place &place,
-                        const std::vector<std::string> &var_names,
+                        const std::unordered_set<std::string> &var_names,
                         GarbageCollector<Tensor> *gc,
                         AtomicReferenceCountMap *ref_cnts);
 
@@ -45,8 +43,6 @@ class EagerDeletionOpHandle : public OpHandleBase {
  private:
   void ClearTensors(const std::vector<Tensor *> &tensors);
 
-  void AddVar(const std::string &name);
-
   const Scope *scope_;
   std::unordered_set<std::string> var_names_;
   GarbageCollector<Tensor> *gc_;  // not own
@@ -55,8 +51,6 @@ class EagerDeletionOpHandle : public OpHandleBase {
   platform::CUDADeviceContext *dev_ctx_{nullptr};
   cudaEvent_t event_{nullptr};
 #endif
-
-  friend class EagerDeletionPass;
 };
 
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc
index f877c2881c..3a1b37e533 100644
--- a/paddle/fluid/framework/details/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/details/eager_deletion_pass.cc
@@ -26,62 +26,61 @@ namespace paddle {
framework { namespace details { -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); - } - - // Add leaf node to eager_deletion_node - if (out->Outputs().empty()) { - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - out->AddOutput(dummy_leaf); - } -} - std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { - auto &vars = graph->Get(kGraphVars); + const auto &vars = graph->Get(kGraphVars); auto &ref_cnts = Get>(kCurReferenceCount); - auto &last_live_ops = Get>(kLastLiveOpsOfVars); + const auto &last_live_ops = + Get>(kLastLiveOpsOfVars); auto &gcs = Get(kGarbageCollector); ref_cnts = std::vector(vars.size()); - std::unordered_map op_map; + std::unordered_map> + op_vars_map; + for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; - for (ComputationOpHandle *op : var_ops_pair.second) { - auto it = op_map.find(op); - if (it != op_map.end()) { - it->second->AddVar(var_name); - } else { - auto *eager_deletion_node = graph->CreateEmptyNode( - "eager_deletion", ir::Node::Type::kOperation); - auto *eager_deletion_op = new EagerDeletionOpHandle( - eager_deletion_node, op->GetScope(), op->GetPlace(), {var_name}, - gcs[op->GetScopeIdx()].get(), &(ref_cnts[op->GetScopeIdx()])); - AddDependencyBetween(op, eager_deletion_op, graph.get()); - op_map[op] = eager_deletion_op; - } + for (auto *op : var_ops_pair.second) { + op_vars_map[op].insert(var_name); } } } - VLOG(10) << "Create " << op_map.size() << " EagerDeletionOpHandle(s)"; + + for (auto &pair : op_vars_map) { + auto *op = pair.first; + auto &var_names = pair.second; + + auto *eager_deletion_node = + graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), + std::move(var_names), gcs[op->GetScopeIdx()].get(), + &(ref_cnts[op->GetScopeIdx()])); + + auto it = std::find_if( + op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != op->Outputs().end()) { + eager_deletion_op->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + op->AddOutput(dep_var); + eager_deletion_op->AddInput(dep_var); + } + + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + eager_deletion_op->AddOutput(dummy_leaf); + } + + VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; return graph; } diff --git a/paddle/fluid/framework/details/op_graph_view.h b/paddle/fluid/framework/details/op_graph_view.h index afb3e8e594..77aa02eba5 100644 --- a/paddle/fluid/framework/details/op_graph_view.h +++ b/paddle/fluid/framework/details/op_graph_view.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include #include @@ -34,6 +34,11 @@ class OpGraphView { bool HasOp(OpHandleBase *op) const; + // Use a visitor to visit all pending ops of 
op + // Stop when callback returns false + template + bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const; + private: void Build(const std::vector &ops); void EnforceHasOp(OpHandleBase *op) const; @@ -44,6 +49,28 @@ class OpGraphView { pending_ops_; }; +template +bool OpGraphView::VisitAllPendingOps(OpHandleBase *op, + Callback &&callback) const { + EnforceHasOp(op); + std::unordered_set visited; + std::queue q; + q.push(op); + do { + op = q.front(); + q.pop(); + for (auto &pending_op : pending_ops_.at(op)) { + if (visited.count(pending_op) == 0) { + visited.insert(pending_op); + if (!callback(pending_op)) { + return false; + } + } + } + } while (!q.empty()); + return true; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index f094c7afa9..2320d3926a 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -14,11 +14,13 @@ #include #include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/op_graph_view.h" #include "paddle/fluid/framework/details/reference_count_pass.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -27,6 +29,89 @@ namespace paddle { namespace framework { namespace details { +struct OpConnectionDetector { + public: + enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; + + explicit OpConnectionDetector(const std::vector &all_ops) + : graph_(all_ops) {} + + template + std::unordered_set MaxNoDepOps( + const OpSet &op_set) { + using KeyType = typename OpSet::key_type; + static_assert( + std::is_base_of::type>::value, + "Key type of OpSet must be or derived of OpHandleBase"); + + std::vector ops(op_set.begin(), op_set.end()); + std::unordered_set ret; + auto rels = GetRelations(ops); + auto not_before = [](RelationShip r) { return r != kBefore; }; + for (size_t i = 0; i < rels.size(); ++i) { + if (std::all_of(rels[i].begin(), rels[i].end(), not_before)) { + ret.insert(static_cast(ops[i])); + } + } + return ret; + } + + private: + std::vector> GetRelations( + const std::vector ops) { + std::unordered_map op_to_idx; + for (size_t i = 0; i < ops.size(); ++i) { + PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); + op_to_idx[ops[i]] = i; + } + + PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops"); + + std::vector> ret(ops.size()); + for (auto &e : ret) { + e.assign(ops.size(), kSame); + } + + size_t found_num = ops.size(); + size_t total_num = ops.size() * ops.size(); + auto visitor = [&](OpHandleBase *op, size_t i) { + auto it = op_to_idx.find(op); + if (it != op_to_idx.end()) { + size_t j = it->second; + if (ret[i][j] != kSame) { + ret[i][j] = kBefore; + ret[j][i] = kAfter; + found_num += 2; + if (found_num == total_num) { + return false; + } + } + } + return true; + }; + + for (size_t i = 0; i < ops.size(); ++i) { + auto sub_visitor = [&, i](OpHandleBase *op) { return visitor(op, i); }; + if (!graph_.VisitAllPendingOps(ops[i], sub_visitor)) { + break; + } + } + + for (size_t i = 0; i < ops.size(); ++i) { + for (size_t j = i + 1; j < ops.size(); ++j) { + if (ret[i][j] != kSame) continue; + ret[i][j] = 
kNoDeps; + ret[j][i] = kNoDeps; + } + } + + return ret; + } + + const OpGraphView graph_; +}; + static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( OpHandleBase *op, size_t scope_idx) { std::queue q; @@ -59,9 +144,15 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( last_live_ops_of_vars = std::vector(vars.size()); ref_cnts = std::vector(vars.size()); + OpConnectionDetector detector(ir::FilterByNodeWrapper(*graph)); + for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { - if (name_var_pair.second.empty()) continue; + if (name_var_pair.second.empty()) { + continue; + } + + const std::string &var_name = name_var_pair.first; auto *last_ver_var = name_var_pair.second.back(); VarDesc *var_desc = nullptr; @@ -83,30 +174,46 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( } std::unordered_set last_live_op; - auto add_last_live_op = [&](OpHandleBase *op) { + auto add_last_live_op = [&](OpHandleBase *op) -> bool { auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); if (compute_op) { last_live_op.insert(compute_op); + return true; + } else { + return false; } }; - const std::string &var_name = name_var_pair.first; + + bool can_delete = false; auto &pending_ops = last_ver_var->PendingOps(); if (pending_ops.empty()) { auto *generated_op = last_ver_var->GeneratedOp(); - if (generated_op) { - ref_cnts[i].emplace(var_name, 1); - add_last_live_op(generated_op); + if (generated_op && add_last_live_op(generated_op)) { + can_delete = true; } } else { - ref_cnts[i].emplace(var_name, pending_ops.size()); + can_delete = true; for (auto *pending_op : pending_ops) { - add_last_live_op(pending_op); + if (!add_last_live_op(pending_op)) { + can_delete = false; + break; + } } } - last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + if (can_delete) { + size_t original_size = last_live_op.size(); + last_live_op = detector.MaxNoDepOps(last_live_op); + if (last_live_op.size() != original_size) { + VLOG(10) << "Shrink last living op number of " << var_name << " from " + << original_size << " to " << last_live_op.size(); + } + ref_cnts[i].emplace(var_name, last_live_op.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + } } } + return graph; } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index f1bf6542a3..0cc3ac8bfb 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -36,6 +36,15 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( } } +void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { + if (gc_) { + for (auto &gc : *gc_) { + gc->Wait(); + gc->Reset(); + } + } +} + FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { @@ -74,19 +83,19 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (size_t i = 0; i < places_.size(); ++i) { - platform::DeviceContextPool::Instance().Get(places_[i])->Wait(); - if (gc_) { - (*gc_)[i]->Wait(); - (*gc_)[i]->Reset(); - } + for (auto &p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); } + WaitAllGarbageCollectors(); for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); 
scope->DeleteScope(local_scope); } + } else { + WaitAllGarbageCollectors(); } + if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index ce3061d6e6..4d52183a20 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -50,6 +50,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; private: + void WaitAllGarbageCollectors(); + size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 96132a2c18..02d1e4114e 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -37,11 +37,49 @@ namespace { int kProgramId = -1; } // namespace +static std::unordered_map GetNonPersistableReferenceCounts( + const BlockDesc& block, const std::vector& skip_var_list) { + std::unordered_map ref_cnts; + std::unordered_set skip_vars(skip_var_list.begin(), + skip_var_list.end()); + + auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + if (skip_vars.count(name)) continue; + auto* var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) continue; + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR && + type != proto::VarType::SELECTED_ROWS && + type != proto::VarType::LOD_TENSOR_ARRAY) { + continue; + } + + auto it = ref_cnts.find(name); + if (it != ref_cnts.end()) { + ++it->second; + } else { + ref_cnts[name] = 1; + } + } + } + }; + + for (auto op_desc : block.AllOps()) { + update_ref_cnts(op_desc, op_desc->Inputs()); + update_ref_cnts(op_desc, op_desc->Outputs()); + } + return ref_cnts; +} + ExecutorPrepareContext::ExecutorPrepareContext( - const framework::ProgramDesc& prog, size_t block_id) + const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars) : prog_(prog), block_id_(block_id) { if (GetEagerDeletionThreshold() >= 0) { - ref_cnts_ = GetNonPersistableReferenceCount(prog_, block_id_); + ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), + skip_ref_cnt_vars); } } @@ -49,10 +87,9 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } -template -static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, - GarbageCollector* gc, - RefCntMap* ref_cnts) { +static void DeleteUnusedTensors( + const Scope& scope, const OperatorBase* op, GarbageCollector* gc, + std::unordered_map* ref_cnts) { std::unordered_set erase_tensors; auto handler = [&](const VariableNameMap& name_map) { @@ -60,7 +97,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, for (auto& name : name_pair.second) { auto it = ref_cnts->find(name); if (it == ref_cnts->end()) continue; - if ((it->second)-- == 1) { + if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { VLOG(10) << "Erase tensor \'" << name << "\'"; @@ -69,6 +106,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, } else if (var->IsType()) { erase_tensors.insert( var->GetMutable()->mutable_value()); + } else if (var->IsType()) { + auto* lod_tensor_arr = var->GetMutable(); + for (auto& 
t : *lod_tensor_arr) { + erase_tensors.insert(&t); + } } } } @@ -351,9 +393,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } std::unique_ptr Executor::Prepare( - const ProgramDesc& program, int block_id) { + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars) { std::unique_ptr ctx( - new ExecutorPrepareContext(program, block_id)); + new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { @@ -364,16 +407,28 @@ std::unique_ptr Executor::Prepare( } std::vector> Executor::Prepare( - const ProgramDesc& program, const std::vector& block_ids) { + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars) { + PADDLE_ENFORCE( + skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), + "skip_ref_cnt_vars should be either empty or equals to block number %d", + block_ids.size()); std::vector> result; + size_t idx = 0; for (auto& bid : block_ids) { - auto* ctx = new ExecutorPrepareContext(program, bid); + ExecutorPrepareContext* ctx; + if (skip_ref_cnt_vars.empty()) { + ctx = new ExecutorPrepareContext(program, bid); + } else { + ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); + } PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } result.push_back(std::shared_ptr(ctx)); + ++idx; } return result; } @@ -392,18 +447,18 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr> gc; - // WhileOp would set keep_kids to true, - // because WhileGradOp needs the scopes created in WhileOp. - // Perhaps, we should not perform eager deletion in WhileOp - // The scopes and variables created by WhileOp would be deleted - // in WhileGradOp. 
- if (max_memory_size >= 0 && !keep_kids) { + if (max_memory_size >= 0) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { - gc.reset(new DefaultStreamGarbageCollector( - boost::get(place_), max_memory_size)); - } else { + if (IsFastEagerDeletionModeEnabled()) { + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place_), max_memory_size)); + } else { + gc.reset(new DefaultStreamGarbageCollector( + boost::get(place_), max_memory_size)); + } + } else if (platform::is_cpu_place(place_)) { #endif gc.reset(new CPUGarbageCollector( boost::get(place_), max_memory_size)); @@ -415,17 +470,14 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, for (auto& op : ctx->ops_) { op->Run(*local_scope, place_); - if (gc != nullptr) { + if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), &(ctx->cur_ref_cnts_)); } } - if (gc != nullptr) { - gc->Wait(); - } else { - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - } + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + if (gc) gc->Wait(); if (local_scope != scope) { scope->DeleteScope(local_scope); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 36b36d49c2..f00d4314b6 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -28,42 +28,11 @@ namespace paddle { namespace framework { extern void InitializeVariable(Variable* var, proto::VarType::Type var_type); -template -std::unordered_map GetNonPersistableReferenceCount( - const ProgramDesc& prog, size_t block_id) { - auto& block = prog.Block(block_id); - std::unordered_map ref_cnts; - - auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { - for (auto& name_pair : name_map) { - for (auto& name : name_pair.second) { - auto* var_desc = block.FindVar(name); - if (var_desc == nullptr || var_desc->Persistable()) continue; - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR && - type != proto::VarType::SELECTED_ROWS) { - continue; - } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } - } - } - }; - - for (auto op_desc : block.AllOps()) { - update_ref_cnts(op_desc, op_desc->Inputs()); - update_ref_cnts(op_desc, op_desc->Outputs()); - } - return ref_cnts; -} - struct ExecutorPrepareContext { - ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); + ~ExecutorPrepareContext(); void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } @@ -72,8 +41,8 @@ struct ExecutorPrepareContext { size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map ref_cnts_; + std::unordered_map cur_ref_cnts_; }; class Executor { @@ -109,10 +78,14 @@ class Executor { const std::string& fetch_holder_name = "fetch"); static std::unique_ptr Prepare( - const ProgramDesc& program, int block_id); + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); static std::vector> Prepare( - const ProgramDesc& program, const std::vector& block_ids); + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars = + std::vector>()); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git 
a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index cbe8f606ef..1382e0d461 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -19,6 +19,9 @@ #include #include #include // NOLINT +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -36,6 +39,11 @@ class GarbageCollector { virtual ~GarbageCollector() {} + size_t NumOfGarbages() const { + std::lock_guard guard(mutex_); + return garbages_->size(); + } + void Reset() { std::lock_guard guard(mutex_); garbages_.reset(new std::deque()); @@ -49,7 +57,7 @@ class GarbageCollector { template void Add(const Container &objs, Callback &&callback) { - std::shared_ptr> clear_deque; + std::deque *clear_deque = nullptr; { std::lock_guard guard(mutex_); for (auto *obj : objs) { @@ -58,7 +66,7 @@ class GarbageCollector { } if (cur_memory_size_ >= max_memory_size_) { cur_memory_size_ = 0; - clear_deque = garbages_; + clear_deque = garbages_.release(); garbages_.reset(new std::deque()); } } @@ -67,6 +75,7 @@ class GarbageCollector { callback(); ClearCallback([clear_deque]() { for (auto *obj : *clear_deque) obj->clear(); + delete clear_deque; }); } } @@ -77,7 +86,7 @@ class GarbageCollector { virtual void ClearCallback(const std::function &callback) = 0; platform::DeviceContext *dev_ctx_; - std::shared_ptr> garbages_; + std::unique_ptr> garbages_; mutable std::mutex mutex_; const size_t max_memory_size_; size_t cur_memory_size_ = 0; @@ -96,6 +105,19 @@ class CPUGarbageCollector : public GarbageCollector { }; #ifdef PADDLE_WITH_CUDA +template +class UnsafeFastGPUGarbageCollector : public GarbageCollector { + public: + UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + + protected: + void ClearCallback(const std::function &callback) override { + callback(); + } +}; + template class DefaultStreamGarbageCollector : public GarbageCollector { public: @@ -109,7 +131,7 @@ class DefaultStreamGarbageCollector : public GarbageCollector { } void Wait() const override { - static_cast(this->dev_ctx_) + static_cast(this->dev_ctx_) ->WaitStreamCallback(); } @@ -126,31 +148,23 @@ class StreamGarbageCollector : public GarbageCollector { StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - platform::SetDeviceId(place.device); + platform::CUDADeviceGuard guard(place.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } ~StreamGarbageCollector() { auto place = boost::get(this->dev_ctx_->GetPlace()); - platform::SetDeviceId(place.device); + platform::CUDADeviceGuard guard(place.device); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); } - void Wait() const override { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - std::lock_guard guard(this->mutex_); - callback_manager_->Wait(); - } + void Wait() const override { callback_manager_->Wait(); } cudaStream_t stream() const { return stream_; } protected: - // ClearCallback and Wait()/Reset() cannot be call in multiple threads - // But it is not important, because they would not be called in multiple - // threads - // either in Executor or ParallelExecutor void ClearCallback(const std::function &callback) override { callback_manager_->AddCallback(callback); } 
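For readers following the garbage_collector.h hunk above: Add() now only queues garbage under the internal mutex, and the deferred ClearCallback() fires once the queued size crosses max_memory_size_, releasing the whole deque off the critical path and deleting the raw deque pointer afterwards. A minimal sketch of a caller is below; the template arguments (elided by the diff) and the helper name CollectDeadTensors are assumptions, not part of the patch.

#include <vector>

#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/tensor.h"

namespace fw = paddle::framework;

// Hand a batch of dead tensors to a CPU collector. Nothing is freed
// synchronously here: Add() enqueues the tensors, and the clear-all
// pass only runs once the accumulated memory exceeds the threshold
// given at construction.
void CollectDeadTensors(fw::CPUGarbageCollector<fw::Tensor> *gc,
                        const std::vector<fw::Tensor *> &dead) {
  gc->Add(dead, [] {
    // Reached only when a full deque is handed off, just before the
    // collector schedules obj->clear() on every queued tensor.
  });
}

The same call site works unchanged with StreamGarbageCollector or the new UnsafeFastGPUGarbageCollector; only the ClearCallback policy (stream callback versus immediate clear) differs.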
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8bfdf38912..a5f714fc89 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -873,6 +873,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { + PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", + ipt_name, DebugString()); int tmp = static_cast(ToDataType(t->type())); PADDLE_ENFORCE( tmp == data_type || data_type == -1, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e71f93beef..3d466e44a1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -158,8 +158,13 @@ ParallelExecutor::ParallelExecutor( auto &place = member_->places_[i]; #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { - member_->gcs_.emplace_back(new StreamGarbageCollector( - boost::get(place), max_memory_size)); + if (IsFastEagerDeletionModeEnabled()) { + member_->gcs_.emplace_back(new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size)); + } else { + member_->gcs_.emplace_back(new StreamGarbageCollector( + boost::get(place), max_memory_size)); + } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; } else if (platform::is_cpu_place(place)) { #endif @@ -181,8 +186,8 @@ ParallelExecutor::ParallelExecutor( &(member_->rt_ref_cnts_)); ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, &last_live_ops_of_vars); - VLOG(10) << "ReferenceCountPass Applied"; graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; auto eager_deletion_pass = ir::PassRegistry::Instance().Get("eager_deletion_pass"); @@ -194,6 +199,8 @@ ParallelExecutor::ParallelExecutor( &last_live_ops_of_vars); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; + + graph->SetNotOwned(details::kGarbageCollector, &(member_->gcs_)); } // Step 3. Create vars in each scope. Passes may also create new vars. diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7cc..cb3b6cdc3e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,6 +38,10 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); +DEFINE_bool(fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory would be released " + "immediately without waiting for GPU kernels to complete."); + // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads concurrently, and // the mutex will cause serious performance issue.
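The flag added above only matters once eager deletion itself is enabled, i.e. once FLAGS_eager_delete_tensor_gb is non-negative so that GetEagerDeletionThreshold() (next hunk) stops returning -1. As a hedged sketch, forcing both knobs on from C++ before the executor is built could look like the following; the gflags DECLARE_* lines mirror the flag definitions in scope.cc, while the function name is illustrative only.

#include "gflags/gflags.h"

// Mirrors the DEFINE_double/DEFINE_bool in scope.cc shown above.
DECLARE_double(eager_delete_tensor_gb);
DECLARE_bool(fast_eager_deletion_mode);

// Illustrative helper: turn on eager deletion plus the fast (no stream
// sync) release path. Set these before building a ParallelExecutor,
// since the garbage collectors are chosen from the flags at that point.
void EnableEagerDeletion() {
  FLAGS_eager_delete_tensor_gb = 0.0;    // any value >= 0 enables deletion
  FLAGS_fast_eager_deletion_mode = true; // the patch's default, made explicit
}

The Python tests added later in this series achieve the same effect by setting os.environ['FLAGS_eager_delete_tensor_gb'] before importing fluid.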
@@ -58,6 +62,8 @@ int64_t GetEagerDeletionThreshold() { (static_cast(1) << 30)); } +bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } + Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 1901ffbe57..aded1f771c 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -27,6 +27,7 @@ namespace paddle { namespace framework { int64_t GetEagerDeletionThreshold(); +bool IsFastEagerDeletionModeEnabled(); class Scope; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 71e8badd4b..3a4c52410e 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -153,7 +153,7 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } - void clear() { holder_ = nullptr; } + void clear() { holder_.reset(); } const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 6c1b2f329a..d8410b4058 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -59,7 +59,21 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto ctx = executor.Prepare(*program, block->ID()); + auto &skip_eager_deletion_vars = + Attr>("skip_eager_deletion_vars"); + if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { + std::string debug_string = + "Skip " + std::to_string(skip_eager_deletion_vars.size()) + + " vars in eager deletion mode: "; + for (auto &var : skip_eager_deletion_vars) { + debug_string.append(var); + debug_string.push_back(' '); + } + VLOG(10) << debug_string; + } + + auto ctx = + executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); while (cond.data()[0]) { auto &current_scope = scope.NewScope(); step_scopes->push_back(&current_scope); @@ -96,6 +110,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); + AddAttr>("skip_eager_deletion_vars", + "Vars that would skip eager deletion. " + "Users should not set this manually.") + .SetDefault(std::vector()); AddComment(R"DOC( )DOC"); } @@ -341,6 +359,30 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed.
while_grad->SetAttr("original_output_grad", output_grads_list); + /* The following codes are used in eager deletion mode */ + if (framework::GetEagerDeletionThreshold() >= 0) { + std::unordered_set skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + // If input var of ops inside grad_block is not from grad_block, + // it cannot be deleted when forward while_op runs + if (in_arg_name != framework::kEmptyVarName && + !grad_block->HasVar(in_arg_name)) { + skip_vars.insert(in_arg_name); + } + } + } + + if (!skip_vars.empty()) { + // FIXME(zjl): ugly const_cast here, maybe we should find a better way + // to modify forward while_op + auto &fwd_while_op = const_cast(ForwardOp()); + fwd_while_op.SetAttr( + "skip_eager_deletion_vars", + std::vector(skip_vars.begin(), skip_vars.end())); + } + } + return std::unique_ptr(while_grad); } }; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae1..7fc07efe73 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -16,6 +16,7 @@ #include +#include #include // NOLINT #include #include @@ -55,8 +56,7 @@ class CTRReader : public framework::FileReader { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = - file_list_.size() > thread_num ? thread_num : file_list_.size(); + thread_num_ = std::min(file_list_.size(), thread_num); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -95,10 +95,10 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { - read_threads_.emplace_back(new std::thread( - std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, - thread_id, &read_thread_status_, queue_))); + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { + read_threads_.emplace_back(new std::thread(std::bind( + &ReadThread, file_groups_[thread_id], slots_, batch_size_, + static_cast(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( std::bind(&MonitorThread, &read_thread_status_, queue_))); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 3edd727978..37453a8c29 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -223,14 +223,10 @@ class CUDADeviceContext : public DeviceContext { template void AddStreamCallback(Callback&& callback) const { - std::lock_guard guard(callback_mtx_); callback_manager_->AddCallback(callback); } - void WaitStreamCallback() const { - std::lock_guard guard(callback_mtx_); - callback_manager_->Wait(); - } + void WaitStreamCallback() const { callback_manager_->Wait(); } #if CUDA_VERSION >= 9000 /*! 
\brief CublasCall may need to change cublas's config, @@ -261,9 +257,7 @@ class CUDADeviceContext : public DeviceContext { mutable std::mutex mtx_; - // This lock is only used by callback - // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes - mutable std::mutex callback_mtx_; + // StreamCallbackManager is thread-safe std::unique_ptr callback_manager_; mutable std::mutex cublas_mtx_; diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index ae915365f8..58ec6f2f5d 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -18,52 +18,47 @@ namespace paddle { namespace platform { -struct StreamCallbackContext { - inline StreamCallbackContext(const StreamCallbackManager *manager, - std::function callback) - : manager_(manager), callback_(std::move(callback)) {} - - const StreamCallbackManager *manager_; // do not own - std::function callback_; -}; +#if CUDA_VERSION >= 10000 +static void CUDART_CB StreamCallbackFunc(void *user_data); +#else +static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, void *user_data) +#endif +{ + std::unique_ptr> func( + reinterpret_cast *>(user_data)); + (*func)(); +} StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream) - : stream_(stream), thread_pool_(new ::ThreadPool(1)) {} + : stream_(stream), thread_pool_(1) {} void StreamCallbackManager::AddCallback(std::function callback) const { - auto *stream_callback_context = - new StreamCallbackContext(this, std::move(callback)); + auto *callback_func = new std::function(std::move(callback)); + auto *func = new std::function([this, callback_func] { + std::lock_guard lock(mtx_); + last_future_ = thread_pool_.enqueue([callback_func] { + std::unique_ptr> releaser(callback_func); + (*callback_func)(); + }); + }); #if CUDA_VERSION >= 10000 - PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context)); + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func)); #else - PADDLE_ENFORCE( - cudaStreamAddCallback(stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0)); + PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif } -void StreamCallbackManager::Wait() const { - thread_pool_.reset(new ::ThreadPool(1)); -} +StreamCallbackManager::~StreamCallbackManager() { Wait(); } -#if CUDA_VERSION >= 10000 -void CUDART_CB StreamCallbackManager::StreamCallbackFunc(void *user_data) -#else -void CUDART_CB StreamCallbackManager::StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, - void *user_data) -#endif -{ - auto *callback_context_ptr = - reinterpret_cast(user_data); - callback_context_ptr->manager_->thread_pool_->enqueue( - [callback_context_ptr]() { - std::unique_ptr callback_context( - callback_context_ptr); - callback_context->callback_(); - }); +void StreamCallbackManager::Wait() const { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + { + std::lock_guard lock(mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } } } // namespace platform diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index eac4806d13..0d5d85bf46 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -18,30 +18,32 @@ #include #include #include +#include // NOLINT #include +#include // NOLINT + +#include 
"paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -// NOTE(zjl): clean StreamCallback to make compilation faster +// NOTE(zjl): clean StreamCallbackManager to make compilation faster +// Make StreamCallbackManager thread-safe class StreamCallbackManager { public: explicit StreamCallbackManager(const cudaStream_t stream); + ~StreamCallbackManager(); + void AddCallback(std::function callback) const; void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr<::ThreadPool> thread_pool_; - -#if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data); -#else - static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data); -#endif + mutable ::ThreadPool thread_pool_; + mutable std::mutex mtx_; + mutable std::future last_future_; }; } // namespace platform diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 02a75236f6..24800e1709 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -162,7 +162,7 @@ void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -200,7 +200,7 @@ void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -221,7 +221,7 @@ inline void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -240,7 +240,7 @@ void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -260,7 +260,7 @@ inline void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f7fefb3e5b..2690149e9b 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,8 +116,9 @@ def __bootstrap__(): 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", - 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir' + 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', + 'allocator_strategy', 'reader_queue_speed_test_mode', + 'print_sub_graph_dir' ] if 'Darwin' not in 
sysstr: read_env_flags.append('use_pinned_memory') From 35a2578426840642acc0b2100be0b1c96c2cf1e9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 3 Dec 2018 13:21:49 +0000 Subject: [PATCH 03/45] fix bug test=develop --- .../framework/details/computation_op_handle.cc | 2 -- .../framework/details/reference_count_pass.cc | 14 +++++++++----- paddle/fluid/platform/stream_callback_manager.cc | 2 -- paddle/fluid/platform/stream_callback_manager.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 2bf43fd4e0..7beb8c8de9 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -31,8 +31,6 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); - VLOG(10) << "Run Op" << Name(); - auto run_func = [this]() { op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); }; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 2320d3926a..0c096e0980 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,7 +29,7 @@ namespace paddle { namespace framework { namespace details { -struct OpConnectionDetector { +class OpConnectionDetector { public: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; @@ -37,8 +37,8 @@ struct OpConnectionDetector { : graph_(all_ops) {} template - std::unordered_set MaxNoDepOps( - const OpSet &op_set) { + OpSet MaxNoDepOps(const OpSet &op_set) { + if (op_set.size() <= 1) return op_set; using KeyType = typename OpSet::key_type; static_assert( std::is_base_of ops(op_set.begin(), op_set.end()); - std::unordered_set ret; + OpSet ret; auto rels = GetRelations(ops); auto not_before = [](RelationShip r) { return r != kBefore; }; for (size_t i = 0; i < rels.size(); ++i) { @@ -79,7 +79,7 @@ struct OpConnectionDetector { auto it = op_to_idx.find(op); if (it != op_to_idx.end()) { size_t j = it->second; - if (ret[i][j] != kSame) { + if (i != j && ret[i][j] == kSame) { ret[i][j] = kBefore; ret[j][i] = kAfter; found_num += 2; @@ -208,6 +208,10 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( VLOG(10) << "Shrink last living op number of " << var_name << " from " << original_size << " to " << last_live_op.size(); } + + PADDLE_ENFORCE(!last_live_op.empty(), + "Last living ops of %s cannot be empty", var_name); + ref_cnts[i].emplace(var_name, last_live_op.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); } diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 58ec6f2f5d..466c77469e 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -49,8 +49,6 @@ void StreamCallbackManager::AddCallback(std::function callback) const { #endif } -StreamCallbackManager::~StreamCallbackManager() { Wait(); } - void StreamCallbackManager::Wait() const { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); { diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 0d5d85bf46..8668bcb113 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -33,7 +33,7 @@ class 
StreamCallbackManager { public: explicit StreamCallbackManager(const cudaStream_t stream); - ~StreamCallbackManager(); + ~StreamCallbackManager() = default; void AddCallback(std::function callback) const; From 2d0d037d8e9e1580d38e800fd1a0d0b0056422eb Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 09:45:50 +0000 Subject: [PATCH 04/45] fix while_op eager deletion bug add unittest test=develop --- paddle/fluid/framework/executor.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 84 +++++++++++++------ .../unittests/test_eager_deletion_mnist.py | 27 ++++++ .../test_eager_deletion_seresnext.py | 27 ++++++ .../test_eager_deletion_transformer.py | 27 ++++++ 5 files changed, 140 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5823f33034..f443c2d8cf 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ static void DeleteUnusedTensors( if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(2) << "Erase tensor \'" << name << "\'"; if (var->IsType()) { erase_tensors.insert(var->GetMutable()); } else if (var->IsType()) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8410b4058..da7cad82d8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,21 +73,12 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto &skip_eager_deletion_vars = - Attr>("skip_eager_deletion_vars"); - if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { - std::string debug_string = - "Skip " + std::to_string(skip_eager_deletion_vars.size()) + - " vars in eager deletion mode: "; - for (auto &var : skip_eager_deletion_vars) { - debug_string.append(var); - debug_string.push_back(' '); - } - VLOG(10) << debug_string; + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); } - auto ctx = - executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { auto &current_scope = scope.NewScope(); step_scopes->push_back(&current_scope); @@ -110,7 +115,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training.
Some layers may run faster when this is true.") .SetDefault(false); - AddAttr>("skip_eager_deletion_vars", + AddAttr>(kSkipEagerDeletionVars, "Vars that would skip eager deletion. " "Users should not set this manually.") .SetDefault(std::vector()); @@ -137,7 +142,12 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + } + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); @@ -359,29 +369,51 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ + /* The following code is used in eager deletion mode */ std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set skip_vars; + std::unordered_set fwd_skip_vars; for (auto *op_desc : grad_block->AllOps()) { + auto skippable = [&](const std::string &name) { + return !grad_block->HasVar(name) && + (fwd_block->HasVarRecursive(name) || + parent_block->HasVarRecursive(name)); + }; for (auto &in_arg_name : op_desc->InputArgumentNames()) { - // If input var of ops inside grad_block is not from grad_block, - // it cannot be deleted when forward while_op runs - if (in_arg_name != framework::kEmptyVarName && - !grad_block->HasVar(in_arg_name)) { - skip_vars.insert(in_arg_name); + if (skippable(in_arg_name)) { + fwd_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (skippable(out_arg_name)) { + fwd_skip_vars.insert(out_arg_name); } } } - if (!skip_vars.empty()) { + if (!fwd_skip_vars.empty()) { // FIXME(zjl): ugly const_cast here, maybe we should find a better way // to modify forward while_op auto &fwd_while_op = const_cast(ForwardOp()); - fwd_while_op.SetAttr( - "skip_eager_deletion_vars", - std::vector(skip_vars.begin(), skip_vars.end())); + fwd_while_op.SetAttr(kSkipEagerDeletionVars, + std::vector(fwd_skip_vars.begin(), + fwd_skip_vars.end())); + } + + // Find backward skip vars + auto fwd_input = Input(kX); + for (size_t i = 0; i < igs.size(); ++i) { + if (igs[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(igs[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); } } + while_grad->SetAttr( + kSkipEagerDeletionVars, + std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end())); return std::unique_ptr(while_grad); } }; diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 0000000000..7ec1f0ae75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py new file mode 100644 index 0000000000..2dcdbdb8f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_seresnext import TestResnet + + +class EagerDeletionTestSEResNext(TestResnet): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 0000000000..754d5fd409 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() From e694d0c2e487a854103e0cc4796f92af6d27ccfd Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 09:45:50 +0000 Subject: [PATCH 05/45] fix while_op eager deletion bug add unittest test=develop --- .../details/eager_deletion_op_handle.cc | 2 + paddle/fluid/framework/executor.cc | 2 +- .../fluid/operators/controlflow/while_op.cc | 84 +++++++++++++------ .../unittests/test_eager_deletion_mnist.py | 27 ++++++ .../test_eager_deletion_seresnext.py | 27 ++++++ .../test_eager_deletion_transformer.py | 27 ++++++ 6 files changed, 142 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 41f616035d..54715fed8d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,9 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5823f33034..f443c2d8cf 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -101,7 +101,7 @@ static void DeleteUnusedTensors( if (--(it->second) == 0) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; + VLOG(2) << "Erase tensor \'" << name << "\'"; if (var->IsType()) { erase_tensors.insert(var->GetMutable()); } else if (var->IsType()) { diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index d8410b4058..da7cad82d8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,21 +73,12 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto &skip_eager_deletion_vars = - Attr>("skip_eager_deletion_vars"); - if (framework::GetEagerDeletionThreshold() >= 0 && VLOG_IS_ON(10)) { - std::string debug_string = - "Skip " + std::to_string(skip_eager_deletion_vars.size()) + - " vars in eager deletion mode: "; - for (auto &var : 
skip_eager_deletion_vars) { - debug_string.append(var); - debug_string.push_back(' '); - } - VLOG(10) << debug_string; + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); } - auto ctx = - executor.Prepare(*program, block->ID(), skip_eager_deletion_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { auto &current_scope = scope.NewScope(); step_scopes->push_back(&current_scope); @@ -110,7 +115,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); - AddAttr>("skip_eager_deletion_vars", + AddAttr>(kSkipEagerDeletionVars, "Vars that would skip eager deletion. " "Users should not set this manually.") .SetDefault(std::vector()); @@ -137,7 +142,12 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + if (framework::GetEagerDeletionThreshold() >= 0) { + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + } + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); @@ -359,29 +369,51 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); - /* The following codes are used in eager deletion mode */ + /* The following code is used in eager deletion mode */ std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { - std::unordered_set skip_vars; + std::unordered_set fwd_skip_vars; for (auto *op_desc : grad_block->AllOps()) { + auto skippable = [&](const std::string &name) { + return !grad_block->HasVar(name) && + (fwd_block->HasVarRecursive(name) || + parent_block->HasVarRecursive(name)); + }; for (auto &in_arg_name : op_desc->InputArgumentNames()) { - // If input var of ops inside grad_block is not from grad_block, - // it cannot be deleted when forward while_op runs - if (in_arg_name != framework::kEmptyVarName && - !grad_block->HasVar(in_arg_name)) { - skip_vars.insert(in_arg_name); + if (skippable(in_arg_name)) { + fwd_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (skippable(out_arg_name)) { + fwd_skip_vars.insert(out_arg_name); } } } - if (!skip_vars.empty()) { + if (!fwd_skip_vars.empty()) { // FIXME(zjl): ugly const_cast here, maybe we should find a better way // to modify forward while_op auto &fwd_while_op = const_cast(ForwardOp()); - fwd_while_op.SetAttr( - "skip_eager_deletion_vars", - std::vector(skip_vars.begin(), skip_vars.end())); + fwd_while_op.SetAttr(kSkipEagerDeletionVars, + std::vector(fwd_skip_vars.begin(), + fwd_skip_vars.end())); + } + + // Find backward skip vars + auto fwd_input = Input(kX); + for (size_t i = 0; i < igs.size(); ++i) { + if (igs[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(igs[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); } } + while_grad->SetAttr( + kSkipEagerDeletionVars, + std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end())); return std::unique_ptr(while_grad); } diff --git
a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 0000000000..7ec1f0ae75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py new file mode 100644 index 0000000000..2dcdbdb8f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_seresnext import TestResnet + + +class EagerDeletionTestSEResNext(TestResnet): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 0000000000..754d5fd409 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() From d0c8b9b9b350f774a7b195bf6c807b90b5f895f9 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 4 Dec 2018 12:00:28 +0000 Subject: [PATCH 06/45] remove timeout unittest test=develop --- paddle/fluid/framework/tensor.h | 2 +- .../test_eager_deletion_seresnext.py | 27 ------------------- 2 files changed, 1 insertion(+), 28 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 3a4c52410e..71e8badd4b 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -153,7 +153,7 @@ class Tensor { void set_layout(const DataLayout layout) { layout_ = layout; } - void clear() { holder_.reset(); } + void clear() { holder_ = nullptr; } const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py deleted file mode 100644 index 2dcdbdb8f1..0000000000 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_seresnext.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import unittest -os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" - -from test_parallel_executor_seresnext import TestResnet - - -class EagerDeletionTestSEResNext(TestResnet): - pass - - -if __name__ == '__main__': - unittest.main() From 387bac46b5e4d95e2888773975d1b6c3a906a588 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:09:43 +0000 Subject: [PATCH 07/45] refine code test=develop --- .../framework/details/eager_deletion_pass.cc | 10 +- .../fluid/framework/details/op_graph_view.cc | 2 + .../framework/details/reference_count_pass.cc | 14 +- .../details/reference_count_pass_helper.h | 10 +- .../scope_buffered_ssa_graph_executor.cc | 8 +- .../scope_buffered_ssa_graph_executor.h | 2 +- paddle/fluid/framework/executor.cc | 14 +- paddle/fluid/framework/executor.h | 6 +- paddle/fluid/framework/parallel_executor.cc | 153 ++++++++++-------- .../fluid/operators/controlflow/while_op.cc | 10 +- 10 files changed, 122 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 3a1b37e533..85991c71e6 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -31,10 +31,11 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( const auto &vars = graph->Get(kGraphVars); auto &ref_cnts = - Get>(kCurReferenceCount); + Get>(kRuntimeReferenceCount); const auto &last_live_ops = Get>(kLastLiveOpsOfVars); - auto &gcs = Get(kGarbageCollector); + auto &gcs = Get(kGarbageCollector); + const auto &places = Get>(kAllPlaces); ref_cnts = std::vector(vars.size()); @@ -58,7 +59,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); auto *eager_deletion_op = new EagerDeletionOpHandle( eager_deletion_node, op->GetScope(), op->GetPlace(), - std::move(var_names), gcs[op->GetScopeIdx()].get(), + std::move(var_names), gcs.at(places[op->GetScopeIdx()]).get(), &(ref_cnts[op->GetScopeIdx()])); auto it = std::find_if( @@ -90,6 +91,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( REGISTER_PASS(eager_deletion_pass, paddle::framework::details::EagerDeletionPass) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) + .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount) .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kAllPlaces) .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index 4838c4198f..b6b5ad42c4 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -23,6 +23,8 @@ namespace details { OpGraphView::OpGraphView(const std::vector &ops) { Build(ops); } void OpGraphView::Build(const std::vector &ops) { + preceding_ops_.clear(); + pending_ops_.clear(); for (auto &op : ops) { preceding_ops_[op]; pending_ops_[op]; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 0c096e0980..f2c9dfb524 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,22 +29,22 @@ namespace paddle { namespace framework { namespace details { -class OpConnectionDetector { +class OpRelationDetector { public: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, 
kAfter = 3 };
 
-  explicit OpConnectionDetector(const std::vector<OpHandleBase *> &all_ops)
+  explicit OpRelationDetector(const std::vector<OpHandleBase *> &all_ops)
       : graph_(all_ops) {}
 
   template <typename OpSet>
-  OpSet MaxNoDepOps(const OpSet &op_set) {
-    if (op_set.size() <= 1) return op_set;
+  OpSet MaxNoDepOps(const OpSet &op_set) const {
     using KeyType = typename OpSet::key_type;
     static_assert(
         std::is_base_of<OpHandleBase,
                         typename std::remove_pointer<KeyType>::type>::value,
-        "Key type of OpSet must be or derived of OpHandleBase");
+        "Key type of OpSet must be OpHandleBase, or derived of OpHandleBase");
 
+    if (op_set.size() <= 1) return op_set;
     std::vector<OpHandleBase *> ops(op_set.begin(), op_set.end());
     OpSet ret;
     auto rels = GetRelations(ops);
@@ -59,7 +59,7 @@ class OpConnectionDetector {
 
  private:
   std::vector<std::vector<RelationShip>> GetRelations(
-      const std::vector<OpHandleBase *> ops) {
+      const std::vector<OpHandleBase *> ops) const {
     std::unordered_map<OpHandleBase *, size_t> op_to_idx;
     for (size_t i = 0; i < ops.size(); ++i) {
       PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph");
@@ -144,7 +144,7 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
   last_live_ops_of_vars = std::vector<LastLiveOpsOfVars>(vars.size());
   ref_cnts = std::vector<ReferenceCountMap>(vars.size());
 
-  OpConnectionDetector detector(ir::FilterByNodeWrapper<OpHandleBase>(*graph));
+  OpRelationDetector detector(ir::FilterByNodeWrapper<OpHandleBase>(*graph));
 
   for (size_t i = 0; i < vars.size(); ++i) {
     for (auto &name_var_pair : vars[i]) {
diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h
index 77846f7bdf..eb534f9701 100644
--- a/paddle/fluid/framework/details/reference_count_pass_helper.h
+++ b/paddle/fluid/framework/details/reference_count_pass_helper.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <atomic>
+#include <map>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -33,12 +34,13 @@
 using ReferenceCountMap = std::unordered_map<std::string, size_t>;
 using AtomicReferenceCountMap =
     std::unordered_map<std::string, std::atomic<size_t>>;
 
-using GarbageCollectorList =
-    std::vector<std::unique_ptr<GarbageCollector<Tensor>>>;
+using GarbageCollectorMap =
+    std::map<platform::Place, std::unique_ptr<GarbageCollector<Tensor>>>;
 
-const char kGlobalReferenceCount[] = "reference_count";
-const char kCurReferenceCount[] = "current_reference_count";
+const char kGlobalReferenceCount[] = "global_reference_count";
+const char kRuntimeReferenceCount[] = "runtime_reference_count";
 const char kGarbageCollector[] = "garbage_collector";
+const char kAllPlaces[] = "all_places";
 
 using LastLiveOpsOfVars =
     std::unordered_map<std::string, std::unordered_set<ComputationOpHandle *>>;
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index da5e277f27..b8775fc329 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -32,15 +32,15 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
       var_infos_(std::move(var_infos)),
       places_(std::move(places)) {
   if (Graph().Has(details::kGarbageCollector)) {
-    gc_ = &(Graph().Get<details::GarbageCollectorList>(details::kGarbageCollector));
+    gc_ = &(Graph().Get<details::GarbageCollectorMap>(details::kGarbageCollector));
   }
 }
 
 void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() {
   if (gc_) {
-    for (auto &gc : *gc_) {
-      gc->Wait();
-      gc->Reset();
+    for (auto &gc_pair : *gc_) {
+      gc_pair.second->Wait();
+      gc_pair.second->Reset();
     }
   }
 }
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index 4d52183a20..6086a219e0 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -60,7 +60,7 @@
std::vector var_infos_; std::vector places_; - GarbageCollectorList* gc_{nullptr}; + GarbageCollectorMap* gc_{nullptr}; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index f443c2d8cf..04425a5983 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -56,13 +56,7 @@ static std::unordered_map GetNonPersistableReferenceCounts( type != proto::VarType::LOD_TENSOR_ARRAY) { continue; } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } + ++ref_cnts[name]; } } }; @@ -79,8 +73,8 @@ ExecutorPrepareContext::ExecutorPrepareContext( const std::vector& skip_ref_cnt_vars) : prog_(prog), block_id_(block_id) { if (GetEagerDeletionThreshold() >= 0) { - ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), - skip_ref_cnt_vars); + global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), + skip_ref_cnt_vars); } } @@ -443,7 +437,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), - &(ctx->cur_ref_cnts_)); + &(ctx->runtime_ref_cnts_)); } } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 412ebd1904..5a040ac641 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -34,14 +34,14 @@ struct ExecutorPrepareContext { ~ExecutorPrepareContext(); - void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } + void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; } const framework::ProgramDesc& prog_; size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map global_ref_cnts_; + std::unordered_map runtime_ref_cnts_; }; class Executor { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3d466e44a1..dfd031f119 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -51,11 +51,22 @@ class ParallelExecutorPrivate { } } - void ResetRuntimeReferenceCount() { - for (size_t i = 0; i < rt_ref_cnts_.size(); ++i) { - for (auto &pair : rt_ref_cnts_[i]) { - rt_cur_ref_cnts_[i][pair.first] = pair.second; + std::unique_ptr PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size); + + inline bool HasGarbageCollectors() const { return !gcs_.empty(); } + + void ResetRuntimeReferenceCount(const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + for (size_t i = 0; i < runtime_ref_cnts_.size(); ++i) { + for (auto &pair : global_ref_cnts_[i]) { + runtime_ref_cnts_[i][pair.first] = pair.second; + } + + for (auto &fetch_name : fetch_tensors) { + runtime_ref_cnts_[i].erase(fetch_name); } + runtime_ref_cnts_[i].erase(fetched_var_name); } } @@ -71,14 +82,75 @@ class ParallelExecutorPrivate { bool use_cuda_; bool use_all_reduce_; - // rt_ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, rt_cur_ref_cnts_ is reset to ref_cnts_ - std::vector rt_ref_cnts_; - std::vector rt_cur_ref_cnts_; - details::GarbageCollectorList gcs_; + // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and + // then keeps unchanged + // Before each iteration, runtime_ref_cnts_ is reset to global_ref_cnts_ + std::vector global_ref_cnts_; + std::vector runtime_ref_cnts_; + 
details::GarbageCollectorMap gcs_; }; +std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size) { + for (size_t i = 0; i < places_.size(); ++i) { + auto &place = places_[i]; + if (gcs_.count(place) > 0) { + continue; + } +#ifdef PADDLE_WITH_CUDA + GarbageCollector *gc = nullptr; + if (platform::is_gpu_place(place)) { + if (IsFastEagerDeletionModeEnabled()) { + gc = new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size); + } else { + gc = new StreamGarbageCollector( + boost::get(place), max_memory_size); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else if (platform::is_cpu_place(place)) { +#endif + gc = new CPUGarbageCollector( + boost::get(place), max_memory_size); + VLOG(10) << "Created GarbageCollector at " << place; +#ifdef PADDLE_WITH_CUDA + } +#endif + + if (gc) { + gcs_[place] = std::unique_ptr>(gc); + } + } + + if (gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &global_ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kRuntimeReferenceCount, + &runtime_ref_cnts_); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + + graph->SetNotOwned(details::kGarbageCollector, &gcs_); + } + + return graph; +} + std::vector &ParallelExecutor::GetLocalScopes() { return member_->local_scopes_; } @@ -153,54 +225,8 @@ ParallelExecutor::ParallelExecutor( auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - size_t place_num = member_->places_.size(); - for (size_t i = 0; i < place_num; ++i) { - auto &place = member_->places_[i]; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(place)) { - if (IsFastEagerDeletionModeEnabled()) { - member_->gcs_.emplace_back(new UnsafeFastGPUGarbageCollector( - boost::get(place), max_memory_size)); - } else { - member_->gcs_.emplace_back(new StreamGarbageCollector( - boost::get(place), max_memory_size)); - } - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; - } else if (platform::is_cpu_place(place)) { -#endif - member_->gcs_.emplace_back(new CPUGarbageCollector( - boost::get(place), max_memory_size)); - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#ifdef PADDLE_WITH_CUDA - } -#endif - } - } - - if (!member_->gcs_.empty()) { - std::vector last_live_ops_of_vars; - - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, - &(member_->rt_ref_cnts_)); - ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - graph = ref_cnt_pass->Apply(std::move(graph)); - VLOG(10) << "ReferenceCountPass Applied"; - - auto eager_deletion_pass = - ir::PassRegistry::Instance().Get("eager_deletion_pass"); - eager_deletion_pass->SetNotOwned(details::kCurReferenceCount, - &(member_->rt_cur_ref_cnts_)); - 
eager_deletion_pass->SetNotOwned(details::kGarbageCollector, - &(member_->gcs_)); - eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - graph = eager_deletion_pass->Apply(std::move(graph)); - VLOG(10) << "EagerDeletionPass Applied"; - - graph->SetNotOwned(details::kGarbageCollector, &(member_->gcs_)); + graph = member_->PrepareGCAndRefCnts(std::move(graph), + static_cast(max_memory_size)); } // Step 3. Create vars in each scope. Passes may also create new vars. @@ -316,15 +342,8 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { platform::RecordBlock b(0); - if (!member_->gcs_.empty()) { - member_->ResetRuntimeReferenceCount(); - size_t n = member_->rt_ref_cnts_.size(); - for (size_t i = 0; i < n; ++i) { - for (auto &fetch_name : fetch_tensors) { - member_->rt_cur_ref_cnts_[i].erase(fetch_name); - } - member_->rt_cur_ref_cnts_[i].erase(fetched_var_name); - } + if (member_->HasGarbageCollectors()) { + member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name); } auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index da7cad82d8..06920a47ee 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -74,9 +74,7 @@ class WhileOp : public framework::OperatorBase { bool is_test = Attr("is_test"); auto &skip_vars = Attr>(kSkipEagerDeletionVars); - if (framework::GetEagerDeletionThreshold() >= 0) { - VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); - } + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { @@ -144,9 +142,7 @@ class WhileGradOp : public framework::OperatorBase { auto *program = block->Program(); auto &skip_vars = Attr>(kSkipEagerDeletionVars); - if (framework::GetEagerDeletionThreshold() >= 0) { - VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); - } + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = @@ -369,7 +365,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. 
while_grad->SetAttr("original_output_grad", output_grads_list); - /* The followi_ng codes are used in eager deletion mode */ + /* The following codes are used in eager deletion mode */ std::unordered_set bwd_skip_vars; if (framework::GetEagerDeletionThreshold() >= 0) { std::unordered_set fwd_skip_vars; From 644baa2e45b64f5a52e237ca1981cb30a5043e0c Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:30:17 +0000 Subject: [PATCH 08/45] fix code bug in CPU compilation test=develop --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfd031f119..fd2bcb8848 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,8 +97,8 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } -#ifdef PADDLE_WITH_CUDA GarbageCollector *gc = nullptr; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { gc = new UnsafeFastGPUGarbageCollector( From 8095fb5e686d3e32f1838dfe7fbf4d0108ef1f25 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 03:30:17 +0000 Subject: [PATCH 09/45] fix code bug in CPU compilation test=develop --- paddle/fluid/framework/parallel_executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfd031f119..e51b1f1f73 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,8 +97,8 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } -#ifdef PADDLE_WITH_CUDA GarbageCollector *gc = nullptr; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { gc = new UnsafeFastGPUGarbageCollector( @@ -122,7 +122,7 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( } } - if (gcs_.empty()) { + if (!gcs_.empty()) { std::vector last_live_ops_of_vars; auto ref_cnt_pass = From eb8252466b11bdbea7abca6fd4cc5816f1c30830 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 09:15:23 +0000 Subject: [PATCH 10/45] polish code add unittest model containing while_op remove unnecessary codes test=develop --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/details/CMakeLists.txt | 5 +- .../details/eager_deletion_op_handle.cc | 48 +++--- .../details/eager_deletion_op_handle.h | 8 +- .../framework/details/eager_deletion_pass.cc | 18 +- .../fluid/framework/details/op_graph_view.cc | 1 + .../framework/details/reference_count_pass.cc | 156 +++++++++++------- .../details/reference_count_pass_helper.cc | 21 +++ .../details/reference_count_pass_helper.h | 4 +- .../scope_buffered_ssa_graph_executor.cc | 21 +-- .../scope_buffered_ssa_graph_executor.h | 6 - paddle/fluid/framework/executor.cc | 56 ++++--- paddle/fluid/framework/garbage_collector.cc | 89 ++++++++++ paddle/fluid/framework/garbage_collector.h | 153 ++++++----------- paddle/fluid/framework/parallel_executor.cc | 28 ++-- paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/tensor.h | 4 + .../unittests/test_eager_deletion_gru_net.py | 49 ++++++ .../unittests/test_eager_deletion_lstm_net.py | 111 +++++++++++++ 19 files changed, 516 insertions(+), 268 deletions(-) create mode 100644 
paddle/fluid/framework/details/reference_count_pass_helper.cc create mode 100644 paddle/fluid/framework/garbage_collector.cc create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c701a2ad63..f2361c5cea 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -72,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) +cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory) + cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) @@ -164,7 +166,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper garbage_collector) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8049f5d3f7..a6c8ef408a 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -33,9 +33,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows op_handle_base) +cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) -cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 
54715fed8d..3b27415e43 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -26,8 +26,8 @@ namespace details { EagerDeletionOpHandle::EagerDeletionOpHandle( ir::Node *node, const Scope *scope, const platform::Place &place, - const std::unordered_set &var_names, - GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts) + const std::unordered_set &var_names, GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts) : OpHandleBase(node), scope_(scope), var_names_(var_names), @@ -35,9 +35,9 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( ref_cnts_(ref_cnts) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { - dev_ctx_ = static_cast( + dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); - if (dynamic_cast *>(gc_)) { + if (dynamic_cast(gc_)) { platform::CUDADeviceGuard guard( boost::get(place).device); PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -61,10 +61,11 @@ std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); - std::vector tensors; + std::deque> garbages; for (auto &name : var_names_) { auto it = ref_cnts_->find(name); - if (it == ref_cnts_->end()) { + // Var not found, not reference count has not decreased to 0 + if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) { continue; } @@ -73,43 +74,44 @@ void EagerDeletionOpHandle::RunImpl() { continue; } + VLOG(2) << "Erase variable " << name; + if (var->IsType()) { - if (it->second.fetch_sub(1) == 1) { - tensors.emplace_back(var->GetMutable()); - } + garbages.emplace_back(var->GetMutable()->MoveMemory()); } else if (var->IsType()) { - if (it->second.fetch_sub(1) == 1) { - tensors.emplace_back(var->GetMutable()->mutable_value()); - } + garbages.emplace_back( + var->GetMutable()->mutable_value()->MoveMemory()); } else if (var->IsType()) { - if (it->second.fetch_sub(1) == 1) { - auto *tensor_arr = var->GetMutable(); - for (auto &t : *tensor_arr) { - tensors.emplace_back(&t); - } + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + garbages.emplace_back(t.MoveMemory()); } + } else { + PADDLE_THROW("Type %s of %s is not supported eager deletion", + var->Type().name(), name); } } - if (!tensors.empty()) { - ClearTensors(tensors); + if (!garbages.empty()) { + ClearGarbages(&garbages); } } -void EagerDeletionOpHandle::ClearTensors(const std::vector &tensors) { +void EagerDeletionOpHandle::ClearGarbages( + std::deque> *garbages) { #ifdef PADDLE_WITH_CUDA if (event_) { auto compute_stream = dev_ctx_->stream(); auto callback_stream = - static_cast *>(gc_)->stream(); + reinterpret_cast(gc_)->stream(); auto callback_func = [=]() { PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); }; - gc_->Add(tensors, callback_func); + gc_->Add(std::move(*garbages), callback_func); } else { #endif - gc_->Add(tensors); + gc_->Add(std::move(*garbages)); #ifdef PADDLE_WITH_CUDA } #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index d8de59cc4d..64867afad5 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -14,8 +14,8 @@ #pragma once +#include #include -#include #include 
"paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" @@ -30,7 +30,7 @@ class EagerDeletionOpHandle : public OpHandleBase { EagerDeletionOpHandle(ir::Node *node, const Scope *scope, const platform::Place &place, const std::unordered_set &var_names, - GarbageCollector *gc, + GarbageCollector *gc, AtomicReferenceCountMap *ref_cnts); ~EagerDeletionOpHandle(); @@ -41,11 +41,11 @@ class EagerDeletionOpHandle : public OpHandleBase { void RunImpl() override; private: - void ClearTensors(const std::vector &tensors); + void ClearGarbages(std::deque> *garbages); const Scope *scope_; std::unordered_set var_names_; - GarbageCollector *gc_; // not own + GarbageCollector *gc_; // not own AtomicReferenceCountMap *ref_cnts_; // not own #ifdef PADDLE_WITH_CUDA platform::CUDADeviceContext *dev_ctx_{nullptr}; diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index 85991c71e6..4e42d0b497 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -28,17 +28,21 @@ namespace details { std::unique_ptr EagerDeletionPass::ApplyImpl( std::unique_ptr graph) const { - const auto &vars = graph->Get(kGraphVars); - auto &ref_cnts = Get>(kRuntimeReferenceCount); + PADDLE_ENFORCE(ref_cnts.empty(), + "kRuntimeReferenceCount should be initialized here!"); + + const auto &vars = graph->Get(kGraphVars); + ref_cnts.resize(vars.size()); + const auto &last_live_ops = Get>(kLastLiveOpsOfVars); - auto &gcs = Get(kGarbageCollector); + const auto &gcs = Get(kGarbageCollector); const auto &places = Get>(kAllPlaces); - ref_cnts = std::vector(vars.size()); - + // a reverse map of last_live_ops + // i.e., last op --> variable names which can be deleted. 
std::unordered_map> op_vars_map; @@ -58,8 +62,8 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( auto *eager_deletion_node = graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); auto *eager_deletion_op = new EagerDeletionOpHandle( - eager_deletion_node, op->GetScope(), op->GetPlace(), - std::move(var_names), gcs.at(places[op->GetScopeIdx()]).get(), + eager_deletion_node, op->GetScope(), op->GetPlace(), var_names, + gcs.at(places[op->GetScopeIdx()]).get(), &(ref_cnts[op->GetScopeIdx()])); auto it = std::find_if( diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index b6b5ad42c4..d3865c2c29 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -42,6 +42,7 @@ void OpGraphView::Build(const std::vector &ops) { std::unordered_set OpGraphView::AllOps() const { std::unordered_set ret; + ret.reserve(preceding_ops_.size()); for (auto &pair : preceding_ops_) { ret.insert(pair.first); } diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index f2c9dfb524..13a042d8e6 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -29,15 +29,17 @@ namespace paddle { namespace framework { namespace details { -class OpRelationDetector { - public: +// A functor to shrink/remove operators who depend on other operators in a set +class ShrinkDepsOpFunctor { + private: enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; - explicit OpRelationDetector(const std::vector &all_ops) + public: + explicit ShrinkDepsOpFunctor(const std::vector &all_ops) : graph_(all_ops) {} template - OpSet MaxNoDepOps(const OpSet &op_set) const { + OpSet operator()(const OpSet &op_set) const { using KeyType = typename OpSet::key_type; static_assert( std::is_base_of(ops[i])); + ret.emplace(static_cast(ops[i])); } } return ret; @@ -59,7 +61,7 @@ class OpRelationDetector { private: std::vector> GetRelations( - const std::vector ops) const { + const std::vector &ops) const { std::unordered_map op_to_idx; for (size_t i = 0; i < ops.size(); ++i) { PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); @@ -112,6 +114,10 @@ class OpRelationDetector { const OpGraphView graph_; }; +/** + * Find the nearest downstream computation op handle. If the op is a + * computation op, just return itself. + */ static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( OpHandleBase *op, size_t scope_idx) { std::queue q; @@ -134,33 +140,87 @@ static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( return nullptr; } +static std::unordered_set +ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, + const ShrinkDepsOpFunctor &shrink_func, + bool *ok) { + // stage one. Get last op for variable. + std::unordered_set candidates; + { + if (var->PendingOps().empty() && var->GeneratedOp()) { + // No operator depends on this variable. So the last operator is the op + // who generates this variable. + candidates.emplace(var->GeneratedOp()); + } else { + candidates = var->PendingOps(); + } + + // No pending ops or generated op is nullptr + if (candidates.empty()) { + *ok = false; + return {}; + } + } + + // stage two. Try to cast them to computation op. + // return (*ok=false) when failed. 
+ // + // The reason why we cannot make any types of op handle to be the last lived + // op is: + // some op handle may operate on many DeviceContext, however, our garbage + // collector can only wait one DeviceContext for now. So currently, we wait + // the nearest compute op. + std::unordered_set computation_op; + { + for (auto *op : candidates) { + auto *compute_op = + FindNextComputationOpHandleOrReturnItself(op, scope_idx); + if (compute_op == nullptr) { + *ok = false; + return {}; + } + computation_op.emplace(compute_op); + } + } + + // stage three. Try to shrink computation op if they depend on each other. + // Get the smallest set of the most ops. + *ok = true; + return shrink_func(computation_op); +} + +static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &vars = graph->Get(kGraphVars); auto &ref_cnts = Get>(kGlobalReferenceCount); auto &last_live_ops_of_vars = Get>(kLastLiveOpsOfVars); - last_live_ops_of_vars = std::vector(vars.size()); - ref_cnts = std::vector(vars.size()); + PADDLE_ENFORCE(last_live_ops_of_vars.empty() && ref_cnts.empty(), + "Last Live Ops and Reference Counts of vars should be " + "initialized at here."); - OpRelationDetector detector(ir::FilterByNodeWrapper(*graph)); + const auto &vars = graph->Get(kGraphVars); - for (size_t i = 0; i < vars.size(); ++i) { - for (auto &name_var_pair : vars[i]) { - if (name_var_pair.second.empty()) { - continue; - } + last_live_ops_of_vars.resize(vars.size()); + ref_cnts.resize(vars.size()); - const std::string &var_name = name_var_pair.first; - auto *last_ver_var = name_var_pair.second.back(); + ShrinkDepsOpFunctor shrink_func( + ir::FilterByNodeWrapper(*graph)); - VarDesc *var_desc = nullptr; - std::find_if(name_var_pair.second.rbegin(), name_var_pair.second.rend(), - [&](VarHandle *var_handle) -> bool { - var_desc = var_handle->Node()->Var(); - return var_desc != nullptr; - }); + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + // Whether this variable can be reused or deleted? If not, we do not + // compute reference counts and dependencies. 
+ VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second); if (var_desc == nullptr || var_desc->Persistable()) { continue; @@ -170,50 +230,20 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( if (var_type != proto::VarType::LOD_TENSOR && var_type != proto::VarType::SELECTED_ROWS && var_type != proto::VarType::LOD_TENSOR_ARRAY) { + // Var type cannot be deleted continue; } - std::unordered_set last_live_op; - auto add_last_live_op = [&](OpHandleBase *op) -> bool { - auto *compute_op = FindNextComputationOpHandleOrReturnItself(op, i); - if (compute_op) { - last_live_op.insert(compute_op); - return true; - } else { - return false; - } - }; - - bool can_delete = false; - auto &pending_ops = last_ver_var->PendingOps(); - if (pending_ops.empty()) { - auto *generated_op = last_ver_var->GeneratedOp(); - if (generated_op && add_last_live_op(generated_op)) { - can_delete = true; - } - } else { - can_delete = true; - for (auto *pending_op : pending_ops) { - if (!add_last_live_op(pending_op)) { - can_delete = false; - break; - } - } - } - - if (can_delete) { - size_t original_size = last_live_op.size(); - last_live_op = detector.MaxNoDepOps(last_live_op); - if (last_live_op.size() != original_size) { - VLOG(10) << "Shrink last living op number of " << var_name << " from " - << original_size << " to " << last_live_op.size(); - } - - PADDLE_ENFORCE(!last_live_op.empty(), - "Last living ops of %s cannot be empty", var_name); + bool ok; + auto result = ExtractComputationOpFromLastLivedVar( + name_var_pair.second.back(), i, shrink_func, &ok); - ref_cnts[i].emplace(var_name, last_live_op.size()); - last_live_ops_of_vars[i].emplace(var_name, std::move(last_live_op)); + if (ok) { + auto &var_name = name_var_pair.first; + PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", + var_name); + ref_cnts[i].emplace(var_name, result.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(result)); } } } diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc new file mode 100644 index 0000000000..89bd08c2d0 --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
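The helper translation unit below is intentionally almost empty; the value is in the header's shared typedefs. As a hedged illustration of how the atomic map is meant to be used (this DecrementAndCheck helper is invented for the example; it mirrors the fetch_sub(1) == 1 test in EagerDeletionOpHandle::RunImpl shown earlier):

    #include <atomic>
    #include <string>
    #include <unordered_map>

    using AtomicReferenceCountMap =
        std::unordered_map<std::string, std::atomic<size_t>>;

    // Returns true iff the caller was the last pending user of `name` and
    // may therefore free its memory. Safe to call from multiple op handles
    // concurrently because the per-variable counter is atomic.
    bool DecrementAndCheck(AtomicReferenceCountMap *ref_cnts,
                           const std::string &name) {
      auto it = ref_cnts->find(name);
      if (it == ref_cnts->end()) return false;  // untracked: never delete here
      return it->second.fetch_sub(1) == 1;
    }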
+ +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +namespace details {} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h index eb534f9701..1c083dbf00 100644 --- a/paddle/fluid/framework/details/reference_count_pass_helper.h +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -18,10 +18,10 @@ #include #include #include +#include #include #include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/tensor.h" namespace paddle { namespace framework { @@ -35,7 +35,7 @@ using AtomicReferenceCountMap = std::unordered_map>; using GarbageCollectorMap = - std::map>>; + std::map>; const char kGlobalReferenceCount[] = "global_reference_count"; const char kRuntimeReferenceCount[] = "runtime_reference_count"; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index b8775fc329..57f6fc66c5 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -30,20 +30,7 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( underlying_executor_(std::move(underlying_executor)), local_scopes_(std::move(local_scopes)), var_infos_(std::move(var_infos)), - places_(std::move(places)) { - if (Graph().Has(details::kGarbageCollector)) { - gc_ = &(Graph().Get(details::kGarbageCollector)); - } -} - -void ScopeBufferedSSAGraphExecutor::WaitAllGarbageCollectors() { - if (gc_) { - for (auto &gc_pair : *gc_) { - gc_pair.second->Wait(); - gc_pair.second->Reset(); - } - } -} + places_(std::move(places)) {} FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { @@ -83,19 +70,15 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams - for (auto &p : places_) { + for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - WaitAllGarbageCollectors(); for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } - } else { - WaitAllGarbageCollectors(); } - if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 6086a219e0..5e87e0bf50 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -21,11 +21,9 @@ #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" - namespace paddle { namespace framework { namespace details { @@ -50,8 +48,6 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; private: - void WaitAllGarbageCollectors(); - size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; @@ -59,8 +55,6 @@ 
class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<Scope*> local_scopes_;
   std::vector<VariableInfo> var_infos_;
   std::vector<platform::Place> places_;
-
-  GarbageCollectorMap* gc_{nullptr};
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 04425a5983..767bbb524f 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/executor.h"
+#include <deque>
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
@@ -83,31 +84,37 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 }
 
 static void DeleteUnusedTensors(
-    const Scope& scope, const OperatorBase* op, GarbageCollector<Tensor>* gc,
+    const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
     std::unordered_map<std::string, size_t>* ref_cnts) {
-  std::unordered_set<Tensor*> erase_tensors;
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;
 
   auto handler = [&](const VariableNameMap& name_map) {
     for (auto& name_pair : name_map) {
       for (auto& name : name_pair.second) {
         auto it = ref_cnts->find(name);
         if (it == ref_cnts->end()) continue;
-        if (--(it->second) == 0) {
-          auto* var = scope.FindVar(name);
-          if (var != nullptr) {
-            VLOG(2) << "Erase tensor \'" << name << "\'";
-            if (var->IsType<LoDTensor>()) {
-              erase_tensors.insert(var->GetMutable<LoDTensor>());
-            } else if (var->IsType<SelectedRows>()) {
-              erase_tensors.insert(
-                  var->GetMutable<SelectedRows>()->mutable_value());
-            } else if (var->IsType<LoDTensorArray>()) {
-              auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
-              for (auto& t : *lod_tensor_arr) {
-                erase_tensors.insert(&t);
-              }
+        if (--(it->second) != 0) {
+          continue;
+        }
+        auto* var = scope.FindVar(name);
+        if (var == nullptr) {
+          continue;
+        }
+
+        VLOG(2) << "Erase variable " << name;
+        if (var->IsType<LoDTensor>()) {
+          garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
+        } else if (var->IsType<SelectedRows>()) {
+          garbages.emplace_back(
+              var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
+        } else if (var->IsType<LoDTensorArray>()) {
+          auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
+          for (auto& t : *lod_tensor_arr) {
+            garbages.emplace_back(t.MoveMemory());
           }
+        } else {
+          PADDLE_THROW("Type %s of %s is not supported eager deletion",
+                       var->Type().name(), name);
         }
       }
     }
@@ -116,8 +123,8 @@ static void DeleteUnusedTensors(
   handler(op->Inputs());
   handler(op->Outputs());
 
-  if (!erase_tensors.empty()) {
-    gc->Add(erase_tensors);
+  if (!garbages.empty()) {
+    gc->Add(std::move(garbages));
   }
 }
 
@@ -411,22 +418,22 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
-  std::unique_ptr<GarbageCollector<Tensor>> gc;
+  std::unique_ptr<GarbageCollector> gc;
   if (max_memory_size >= 0) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
       if (IsFastEagerDeletionModeEnabled()) {
-        gc.reset(new UnsafeFastGPUGarbageCollector<Tensor>(
+        gc.reset(new UnsafeFastGPUGarbageCollector(
             boost::get<platform::CUDAPlace>(place_), max_memory_size));
       } else {
-        gc.reset(new DefaultStreamGarbageCollector<Tensor>(
+        gc.reset(new DefaultStreamGarbageCollector(
            boost::get<platform::CUDAPlace>(place_), max_memory_size));
       }
     } else if (platform::is_cpu_place(place_)) {
 #endif
-      gc.reset(new CPUGarbageCollector<Tensor>(
-          boost::get<platform::CPUPlace>(place_), max_memory_size));
+      gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place_),
+                                       max_memory_size));
 #ifdef PADDLE_WITH_CUDA
     }
 #endif
@@ -442,7 +449,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  if
(gc) gc->Wait(); if (local_scope != scope) { scope->DeleteScope(local_scope); diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc new file mode 100644 index 0000000000..54d9d0dc01 --- /dev/null +++ b/paddle/fluid/framework/garbage_collector.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif +#include "paddle/fluid/framework/garbage_collector.h" + +namespace paddle { +namespace framework { + +GarbageCollector::GarbageCollector(const platform::Place &place, + size_t max_memory_size) + : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { + garbages_.reset(new GarbageQueue()); + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); +} + +CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CPUGarbageCollector::ClearCallback(const std::function &callback) { + callback(); +} + +#ifdef PADDLE_WITH_CUDA +UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( + const platform::CUDAPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void UnsafeFastGPUGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +DefaultStreamGarbageCollector::DefaultStreamGarbageCollector( + const platform::CUDAPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void DefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void DefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} + +StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) { + platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + callback_manager_.reset(new platform::StreamCallbackManager(stream_)); +} + +StreamGarbageCollector::~StreamGarbageCollector() { + auto place = boost::get(this->dev_ctx_->GetPlace()); + platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +} + +cudaStream_t StreamGarbageCollector::stream() const { return stream_; } + +void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); } + +void StreamGarbageCollector::ClearCallback( + const std::function &callback) { + callback_manager_->AddCallback(callback); +} +#endif +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 1382e0d461..2768671029 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ 
b/paddle/fluid/framework/garbage_collector.h @@ -14,160 +14,83 @@ #pragma once -#include #include #include #include #include // NOLINT -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_device_guard.h" -#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { namespace framework { -// T should have memory_size() and clear() method -template class GarbageCollector { public: - GarbageCollector(const platform::Place &place, size_t max_memory_size) - : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { - garbages_.reset(new std::deque()); - dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); - } + using GarbageQueue = std::deque>; - virtual ~GarbageCollector() {} + GarbageCollector(const platform::Place &place, size_t max_memory_size); - size_t NumOfGarbages() const { - std::lock_guard guard(mutex_); - return garbages_->size(); - } + virtual ~GarbageCollector() = default; - void Reset() { - std::lock_guard guard(mutex_); - garbages_.reset(new std::deque()); - cur_memory_size_ = 0; - } + virtual void Wait() const {} template - void Add(const Container &objs) { - Add(objs, []() {}); - } + void Add(Container &&objs); template - void Add(const Container &objs, Callback &&callback) { - std::deque *clear_deque = nullptr; - { - std::lock_guard guard(mutex_); - for (auto *obj : objs) { - garbages_->push_back(obj); - cur_memory_size_ += obj->memory_size(); - } - if (cur_memory_size_ >= max_memory_size_) { - cur_memory_size_ = 0; - clear_deque = garbages_.release(); - garbages_.reset(new std::deque()); - } - } - - if (clear_deque != nullptr) { - callback(); - ClearCallback([clear_deque]() { - for (auto *obj : *clear_deque) obj->clear(); - delete clear_deque; - }); - } - } - - virtual void Wait() const {} + void Add(Container &&objs, Callback &&callback); protected: virtual void ClearCallback(const std::function &callback) = 0; platform::DeviceContext *dev_ctx_; - std::unique_ptr> garbages_; + std::unique_ptr garbages_; mutable std::mutex mutex_; const size_t max_memory_size_; - size_t cur_memory_size_ = 0; + size_t cur_memory_size_{0}; }; -template -class CPUGarbageCollector : public GarbageCollector { +class CPUGarbageCollector : public GarbageCollector { public: - CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; #ifdef PADDLE_WITH_CUDA -template -class UnsafeFastGPUGarbageCollector : public GarbageCollector { +class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; -template -class DefaultStreamGarbageCollector : public GarbageCollector { +class DefaultStreamGarbageCollector : public GarbageCollector { public: DefaultStreamGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + size_t max_memory_size); - cudaStream_t stream() const { - return static_cast(this->dev_ctx_) - ->stream(); - } - - void Wait() const override { - 
static_cast(this->dev_ctx_) - ->WaitStreamCallback(); - } + void Wait() const override; protected: - void ClearCallback(const std::function &callback) override { - static_cast(this->dev_ctx_) - ->AddStreamCallback(callback); - } + void ClearCallback(const std::function &callback) override; }; -template -class StreamGarbageCollector : public GarbageCollector { +class StreamGarbageCollector : public GarbageCollector { public: StreamGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) { - platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE(cudaStreamCreate(&stream_)); - callback_manager_.reset(new platform::StreamCallbackManager(stream_)); - } + size_t max_memory_size); - ~StreamGarbageCollector() { - auto place = boost::get(this->dev_ctx_->GetPlace()); - platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE(cudaStreamDestroy(stream_)); - } + ~StreamGarbageCollector(); - void Wait() const override { callback_manager_->Wait(); } + void Wait() const override; - cudaStream_t stream() const { return stream_; } + cudaStream_t stream() const; protected: - void ClearCallback(const std::function &callback) override { - callback_manager_->AddCallback(callback); - } + void ClearCallback(const std::function &callback) override; private: cudaStream_t stream_; @@ -175,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector { }; #endif +template +void GarbageCollector::Add(Container &&objs) { + Add(std::forward(objs), []() {}); +} + +template +void GarbageCollector::Add(Container &&objs, Callback &&callback) { + GarbageQueue *garbage_queue = nullptr; + { + std::lock_guard guard(mutex_); + for (auto &obj : objs) { + if (!obj) continue; + cur_memory_size_ += obj->size(); + garbages_->push_back(std::move(obj)); + } + if (cur_memory_size_ >= max_memory_size_) { + cur_memory_size_ = 0; + garbage_queue = garbages_.release(); + garbages_.reset(new GarbageQueue()); + } + } + + if (garbage_queue) { + callback(); + ClearCallback([garbage_queue]() { delete garbage_queue; }); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e51b1f1f73..7458b69af8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,29 +97,31 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( if (gcs_.count(place) > 0) { continue; } - GarbageCollector *gc = nullptr; + std::unique_ptr gc; #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { if (IsFastEagerDeletionModeEnabled()) { - gc = new UnsafeFastGPUGarbageCollector( - boost::get(place), max_memory_size); + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size)); } else { - gc = new StreamGarbageCollector( - boost::get(place), max_memory_size); + gc.reset(new StreamGarbageCollector( + boost::get(place), max_memory_size)); } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; - } else if (platform::is_cpu_place(place)) { + } else { #endif - gc = new CPUGarbageCollector( - boost::get(place), max_memory_size); - VLOG(10) << "Created GarbageCollector at " << place; + if (platform::is_cpu_place(place)) { + gc.reset(new CPUGarbageCollector(boost::get(place), + max_memory_size)); + VLOG(10) << "Created GarbageCollector at " << place; + } else { + PADDLE_THROW("Unsupported place for garbage collection"); + } #ifdef 
PADDLE_WITH_CUDA } #endif - if (gc) { - gcs_[place] = std::unique_ptr>(gc); - } + gcs_.emplace(place, std::move(gc)); } if (!gcs_.empty()) { @@ -144,8 +146,6 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; - - graph->SetNotOwned(details::kGarbageCollector, &gcs_); } return graph; diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index cb3b6cdc3e..6fa5e99f9f 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,7 +38,7 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); -DEFINE_bool(fast_eager_deletion_mode, true, +DEFINE_bool(fast_eager_deletion_mode, false, "Fast eager deletion mode. If enabled, memory would release " "immediately without waiting GPU kernel ends."); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 71e8badd4b..9f7027f5ae 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -158,6 +158,10 @@ class Tensor { const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } + std::shared_ptr MoveMemory() { + return std::move(holder_); + } + private: /*! holds the memory block if allocated. */ std::shared_ptr holder_; diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py new file mode 100644 index 0000000000..1ec174544c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py @@ -0,0 +1,49 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
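For reference, the collection policy implemented in garbage_collector.cc/.h above reduces to: Add() only enqueues garbage and accumulates its byte count, and once the running total reaches max_memory_size_ the whole queue is handed to ClearCallback(), which frees immediately on CPU (and in the unsafe fast GPU mode) but defers behind the device stream otherwise. A minimal Python model of that threshold-flush behavior (an illustrative sketch with hypothetical names, not Paddle's API; queued objects are assumed to expose size(), mirroring memory::Allocation::size()):

class ToyGarbageCollector(object):
    """Toy model of the deferred-deletion queue; not Paddle's API."""

    def __init__(self, max_memory_size):
        self.max_memory_size = max(max_memory_size, 1)
        self.cur_memory_size = 0
        self.garbages = []

    def clear_callback(self, callback):
        # CPU behavior: free immediately. A stream-based collector would
        # instead enqueue `callback` behind the GPU kernels that may
        # still be reading the memory.
        callback()

    def add(self, objs):
        for obj in objs:
            self.garbages.append(obj)
            self.cur_memory_size += obj.size()
        if self.cur_memory_size >= self.max_memory_size:
            # Swap out the full queue and reset the counter, as in Add().
            queue, self.garbages = self.garbages, []
            self.cur_memory_size = 0

            def release(q=queue):
                del q[:]  # dropping the queue frees every held object

            self.clear_callback(release)

The swap-then-release structure matters: the lock (elided in this sketch) only protects the enqueue and swap, so the potentially expensive frees run outside the critical section, exactly as in the C++ Add() above.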
+ +import unittest +from test_eager_deletion_lstm_net import TestBase +import paddle.fluid as fluid + + +def gru_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=400.0): + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3) + gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) + gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') + gru_max_tanh = fluid.layers.tanh(gru_max) + fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + return avg_cost + + +class GRUTest(TestBase): + def setUp(self): + self.net = gru_net + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py new file mode 100644 index 0000000000..431765bff2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py @@ -0,0 +1,111 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' +os.environ['CPU_NUM'] = '2' + +import six +import unittest + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + + +def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): + if use_cuda and not core.is_compiled_with_cuda(): + print('Skip use_cuda=True because Paddle is not compiled with cuda') + return + + word_dict = paddle.dataset.imdb.word_dict() + train_reader = paddle.batch( + paddle.dataset.imdb.train(word_dict), batch_size=batch_size) + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + cost = network(data, label, len(word_dict)) + optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) + optimizer.minimize(cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + reader = feeder.decorate_reader( + train_reader, multi_devices=use_parallel_executor) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if use_parallel_executor: + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=cost.name) + fetch_list = [cost.name] + else: + train_exe = exe + fetch_list = [cost] + + for pass_id in six.moves.xrange(pass_num): + batch_id = 0 + for data in reader(): + train_exe.run(feed=data, + fetch_list=fetch_list if batch_id % 4 == 0 else []) + batch_id += 1 + if batch_id > 16: + break + + +def lstm_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) + lstm_h, c = fluid.layers.dynamic_lstm( + input=fc0, size=hid_dim * 4, is_reverse=False) + lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') + lstm_max_tanh = fluid.layers.tanh(lstm_max) + fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + return avg_cost + + +class TestBase(unittest.TestCase): + def setUp(self): + self.net = lstm_net + + def test_network(self): + for use_cuda in [True, False]: + for use_parallel_executor in [False, True]: + print('network: {}, use_cuda: {}, use_parallel_executor: {}'. 
+ format(self.net.__name__, use_cuda, + use_parallel_executor)) + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(core.Scope()): + train(self.net, use_cuda, use_parallel_executor) + + +if __name__ == "__main__": + unittest.main() From 2c6159a151d573ca697e2dfd591720cc854b4b9b Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 7 Dec 2018 13:59:36 +0000 Subject: [PATCH 11/45] fix unittest fix cmake test=develop --- paddle/fluid/framework/CMakeLists.txt | 4 +- .../test_eager_deletion_dynamic_rnn_base.py | 86 +++++++++++++++++++ .../unittests/test_eager_deletion_gru_net.py | 2 +- .../unittests/test_eager_deletion_lstm_net.py | 67 +-------------- 4 files changed, 92 insertions(+), 67 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index f2361c5cea..b236eef3ce 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -171,9 +171,9 @@ if(WITH_DISTRIBUTE) set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() if(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper garbage_collector) else(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper garbage_collector) endif(NOT WIN32) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py new file mode 100644 index 0000000000..e91cfe0b45 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -0,0 +1,86 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' +os.environ['CPU_NUM'] = '2' + +import six +import unittest + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + + +def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): + if use_cuda and not core.is_compiled_with_cuda(): + print('Skip use_cuda=True because Paddle is not compiled with cuda') + return + + word_dict = paddle.dataset.imdb.word_dict() + train_reader = paddle.batch( + paddle.dataset.imdb.train(word_dict), batch_size=batch_size) + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + cost = network(data, label, len(word_dict)) + optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) + optimizer.minimize(cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + reader = feeder.decorate_reader( + train_reader, multi_devices=use_parallel_executor) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if use_parallel_executor: + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=cost.name) + fetch_list = [cost.name] + else: + train_exe = exe + fetch_list = [cost] + + for pass_id in six.moves.xrange(pass_num): + batch_id = 0 + for data in reader(): + train_exe.run(feed=data, + fetch_list=fetch_list if batch_id % 4 == 0 else []) + batch_id += 1 + if batch_id > 16: + break + + +class TestBase(unittest.TestCase): + def setUp(self): + self.net = None + + def test_network(self): + if self.net is None: + return + + for use_cuda in [True, False]: + for use_parallel_executor in [False, True]: + print('network: {}, use_cuda: {}, use_parallel_executor: {}'. + format(self.net.__name__, use_cuda, + use_parallel_executor)) + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(core.Scope()): + train(self.net, use_cuda, use_parallel_executor) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py index 1ec174544c..5ed3d9fdf3 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py @@ -13,7 +13,7 @@ # limitations under the License. import unittest -from test_eager_deletion_lstm_net import TestBase +from test_eager_deletion_dynamic_rnn_base import TestBase import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py index 431765bff2..8462c06aa5 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py @@ -12,60 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
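One detail worth spelling out in the shared base above: unittest discovers and runs TestBase itself, not only its subclasses, so setUp() leaves self.net as None and test_network() returns early in that case, making the base class a harmless no-op. A concrete test then reduces to a few lines, as the GRU and LSTM diffs below show; for example (MyNetTest is a hypothetical illustration, reusing the lstm_net builder refactored below):

from test_eager_deletion_dynamic_rnn_base import TestBase
from test_eager_deletion_lstm_net import lstm_net

import unittest


class MyNetTest(TestBase):
    def setUp(self):
        # any function with the network(data, label, dict_dim) signature
        # that train() expects, returning the average cost, works here
        self.net = lstm_net


if __name__ == "__main__":
    unittest.main()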
-import os -os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' -os.environ['CPU_NUM'] = '2' - -import six -import unittest - -import paddle -import paddle.fluid.core as core +from test_eager_deletion_dynamic_rnn_base import TestBase import paddle.fluid as fluid - - -def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): - if use_cuda and not core.is_compiled_with_cuda(): - print('Skip use_cuda=True because Paddle is not compiled with cuda') - return - - word_dict = paddle.dataset.imdb.word_dict() - train_reader = paddle.batch( - paddle.dataset.imdb.train(word_dict), batch_size=batch_size) - - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - - cost = network(data, label, len(word_dict)) - optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) - optimizer.minimize(cost) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - reader = feeder.decorate_reader( - train_reader, multi_devices=use_parallel_executor) - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - if use_parallel_executor: - train_exe = fluid.ParallelExecutor( - use_cuda=use_cuda, loss_name=cost.name) - fetch_list = [cost.name] - else: - train_exe = exe - fetch_list = [cost] - - for pass_id in six.moves.xrange(pass_num): - batch_id = 0 - for data in reader(): - train_exe.run(feed=data, - fetch_list=fetch_list if batch_id % 4 == 0 else []) - batch_id += 1 - if batch_id > 16: - break +import unittest def lstm_net(data, @@ -92,20 +41,10 @@ def lstm_net(data, return avg_cost -class TestBase(unittest.TestCase): +class LSTMTest(TestBase): def setUp(self): self.net = lstm_net - def test_network(self): - for use_cuda in [True, False]: - for use_parallel_executor in [False, True]: - print('network: {}, use_cuda: {}, use_parallel_executor: {}'. - format(self.net.__name__, use_cuda, - use_parallel_executor)) - with fluid.program_guard(fluid.Program(), fluid.Program()): - with fluid.scope_guard(core.Scope()): - train(self.net, use_cuda, use_parallel_executor) - if __name__ == "__main__": unittest.main() From eab47459658b10ec799a5dccbfd9bf8f45b9771a Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 26 Nov 2018 15:53:33 +0800 Subject: [PATCH 12/45] add adaptive mode for pool. --- paddle/fluid/operators/math/pooling.cc | 202 ++++++++++++++++++------- paddle/fluid/operators/math/pooling.cu | 26 ++-- 2 files changed, 161 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 8df43bb616..68fed9fd4e 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -19,6 +19,16 @@ namespace paddle { namespace operators { namespace math { +static inline int ADAPT_START_INDEX(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +static inline int ADAPT_END_INDEX(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + /* * All tensors are in NCHW format. * Ksize, strides, paddings are two elements. 
These two elements represent @@ -31,7 +41,7 @@ class Pool2dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -54,13 +64,23 @@ class Pool2dFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { @@ -68,8 +88,9 @@ class Pool2dFunctor { pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } @@ -94,7 +115,7 @@ class Pool2dGradFunctor { const framework::Tensor& output, const framework::Tensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_grad_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -118,15 +139,26 @@ class Pool2dGradFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - int pool_size = exclusive ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -251,7 +283,7 @@ class Pool3dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -279,17 +311,32 @@ class Pool3dFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + } else { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int output_idx = (pd * output_height + ph) * output_width + pw; T ele = pool_process.initial(); for (int d = dstart; d < dend; ++d) { @@ -302,7 +349,7 @@ class Pool3dFunctor { } } int pool_size = - exclusive + (exclusive || adaptive) ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); @@ -330,7 +377,7 @@ class Pool3dGradFunctor { const framework::Tensor& output, const framework::Tensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_grad_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -359,21 +406,35 @@ class Pool3dGradFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + } else { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int pool_size = - exclusive + (exclusive || adaptive) ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; float scale = 1.0 / pool_size; @@ -517,8 +578,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -541,13 +602,23 @@ class MaxPool2dWithIndexFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } T1 ele = static_cast(-FLT_MAX); int index = -1; @@ -666,17 +737,32 @@ class MaxPool3dWithIndexFunctor { for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + } else { + int dstart = pd * stride_depth - padding_depth; + int dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); + } else { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int output_idx = (pd * output_height + ph) * output_width + pw; T1 ele = static_cast(-FLT_MAX); diff --git a/paddle/fluid/operators/math/pooling.cu 
b/paddle/fluid/operators/math/pooling.cu index cdc79e207a..06e92665c7 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -29,7 +29,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, T* output_data) { + bool exclusive, bool adaptive, T* output_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -37,13 +37,21 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); - hstart = max(hstart, 0); + if (adaptive) { + int hstart = ADAPT_START_INDEX(ph, input_height, output_height); + int hend = ADAPT_END_INDEX(ph, input_height, output_height); - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); - wstart = max(wstart, 0); + int wstart = ADAPT_START_INDEX(pw, input_width, output_width); + int wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + int hstart = ph * stride_height - padding_height; + int hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + int wstart = pw * stride_width - padding_width; + int wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + } input_data += (batch_idx * channels + c) * input_height * input_width; T ele = pool_process.initial(); @@ -52,8 +60,8 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size = (exclusive || adaptive) ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); output_data[index] = ele; } From 266c6856c90836296f908afa5fff3e08b3ebb718 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 28 Nov 2018 22:09:23 +0800 Subject: [PATCH 13/45] add adaptive pool 2d & 3d. 
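The adaptive mode introduced in the previous patch and polished below splits each spatial dimension into output_size nearly equal windows instead of sliding a fixed kernel: ADAPT_START_INDEX computes hstart = floor(ph * input_size / output_size) and ADAPT_END_INDEX computes hend = ceil((ph + 1) * input_size / output_size), so the windows tile the whole input (adjacent windows may share an element) and each output bin is reduced over its own hend - hstart elements, which is why the adaptive branch reuses the exclusive-style pool_size. A quick standalone check of that mapping (Python sketch mirroring the C++ helpers):

import math


def adapt_start_index(ph, input_size, output_size):
    # floor(ph * input_size / output_size), as in ADAPT_START_INDEX
    return (ph * input_size) // output_size


def adapt_end_index(ph, input_size, output_size):
    # ceil((ph + 1) * input_size / output_size), as in ADAPT_END_INDEX
    return int(math.ceil(float((ph + 1) * input_size) / output_size))


# input_size=10, output_size=3 yields windows [0, 4), [3, 7), [6, 10):
# every input index is covered and each bin has its own width.
for ph in range(3):
    print(adapt_start_index(ph, 10, 3), adapt_end_index(ph, 10, 3))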
test=develop --- paddle/fluid/API.spec | 2 + paddle/fluid/operators/math/pooling.cc | 143 +++--- paddle/fluid/operators/math/pooling.cu | 411 +++++++++++------- paddle/fluid/operators/math/pooling.h | 20 +- paddle/fluid/operators/pool_op.cc | 26 +- paddle/fluid/operators/pool_op.h | 16 +- paddle/fluid/operators/pool_with_index_op.cc | 27 +- paddle/fluid/operators/pool_with_index_op.h | 12 +- paddle/fluid/operators/spp_op.h | 6 +- python/paddle/fluid/layers/nn.py | 186 ++++++++ .../fluid/tests/unittests/test_layers.py | 22 + .../fluid/tests/unittests/test_pool2d_op.py | 91 ++-- .../fluid/tests/unittests/test_pool3d_op.py | 121 ++++-- .../fluid/tests/unittests/test_pool_max_op.py | 95 +++- 14 files changed, 860 insertions(+), 318 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index fd4cf92d85..87ed586aad 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -77,6 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 68fed9fd4e..b4ee82add3 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -61,24 +61,26 @@ class Pool2dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, 
output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -136,24 +138,26 @@ class Pool2dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } int pool_size = (exclusive || adaptive) @@ -308,33 +312,36 @@ class Pool3dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); } else { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); dstart = std::max(dstart, 0); } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { 
- int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } int output_idx = (pd * output_height + ph) * output_width + pw; @@ -403,33 +410,36 @@ class Pool3dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); } else { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); dstart = std::max(dstart, 0); } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -599,24 +609,26 @@ class MaxPool2dWithIndexFunctor { T1* output_data = output->mutable_data(context.GetPlace()); T2* mask_data = mask->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + 
ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -655,7 +667,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_height = input_grad->dims()[2]; @@ -708,8 +720,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -734,33 +746,36 @@ class MaxPool3dWithIndexFunctor { T1* output_data = output->mutable_data(context.GetPlace()); T2* mask_data = mask->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - int dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - int dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); } else { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); dstart = std::max(dstart, 0); } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); } else { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + 
ksize_width, input_width); wstart = std::max(wstart, 0); } @@ -804,7 +819,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_depth = input_grad->dims()[2]; diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 06e92665c7..5f3b82ed55 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -21,6 +21,18 @@ namespace paddle { namespace operators { namespace math { +__device__ __forceinline__ int ADAPT_START_INDEX(int ph, int input_size, + int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +__device__ __forceinline__ int ADAPT_END_INDEX(int ph, int input_size, + int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + template __global__ void KernelPool2D(const int nthreads, const T* input_data, const int channels, const int input_height, @@ -37,19 +49,21 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; + int hstart, hend; + int wstart, wend; if (adaptive) { - int hstart = ADAPT_START_INDEX(ph, input_height, output_height); - int hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); - int wstart = ADAPT_START_INDEX(pw, input_width, output_width); - int wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); } else { - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); + hstart = ph * stride_height - padding_height; + hend = min(hstart + ksize_height, input_height); hstart = max(hstart, 0); - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); + wstart = pw * stride_width - padding_width; + wend = min(wstart + ksize_width, input_width); wstart = max(wstart, 0); } @@ -74,7 +88,7 @@ __global__ void KernelPool2DGrad( const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, T* input_grad) { + PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int offsetW = index % input_width + padding_width; @@ -82,14 +96,24 @@ __global__ void KernelPool2DGrad( int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; - int phstart = (offsetH < ksize_height) - ? 0 - : (offsetH - ksize_height) / stride_height + 1; - int pwstart = (offsetW < ksize_width) - ? 
0 - : (offsetW - ksize_width) / stride_width + 1; - int phend = min(offsetH / stride_height + 1, output_height); - int pwend = min(offsetW / stride_width + 1, output_width); + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + phstart = (offsetH < ksize_height) + ? 0 + : (offsetH - ksize_height) / stride_height + 1; + pwstart = (offsetW < ksize_width) + ? 0 + : (offsetW - ksize_width) / stride_width + 1; + phend = min(offsetH / stride_height + 1, output_height); + pwend = min(offsetW / stride_width + 1, output_width); + } T gradient = 0; T input = input_data[index]; int output_idx = @@ -98,14 +122,22 @@ __global__ void KernelPool2DGrad( output_grad += output_idx; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size; + if (adaptive) { + pool_size = static_cast(ceil(static_cast(input_height) / + ksize_height)) * + static_cast( + ceil(static_cast(input_width) / ksize_width)); + } else { + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + pool_size = exclusive ? 
(hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; + } int output_sub_idx = ph * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], output_grad[output_sub_idx], @@ -189,7 +221,7 @@ void Pool2dDirectCUDAFunctor::operator()( KernelPool2D<<>>( nthreads, input, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, - padding_height, padding_width, pool_compute, exclusive, output); + padding_height, padding_width, pool_compute, exclusive, false, output); } /* @@ -204,7 +236,7 @@ class Pool2dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -231,7 +263,7 @@ class Pool2dFunctor { nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, exclusive, - output_data); + adaptive, output_data); } }; @@ -250,7 +282,8 @@ class Pool2dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -278,7 +311,7 @@ class Pool2dGradFunctor { nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, - pool_process, exclusive, input_grad_data); + pool_process, exclusive, adaptive, input_grad_data); } }; @@ -367,7 +400,7 @@ __global__ void KernelPool3D( const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, T* output_data) { + PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -376,15 +409,30 @@ __global__ void KernelPool3D( int c = (index / output_width / output_height / output_depth) % channels; int batch_idx = index / output_width / output_height / output_depth / channels; - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); + + int dstart, dend; + int hstart, hend; + int wstart, wend; + if (adaptive) { + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); + + wstart = ADAPT_START_INDEX(pw, 
input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + dstart = pd * stride_depth - padding_depth; + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + dend = min(dstart + ksize_depth, input_depth); + hend = min(hstart + ksize_height, input_height); + wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + } T ele = pool_process.initial(); input_data += (batch_idx * channels + c) * input_depth * input_height * input_width; @@ -396,7 +444,7 @@ __global__ void KernelPool3D( } } } - int pool_size = exclusive + int pool_size = (exclusive || adaptive) ? (dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); @@ -413,7 +461,7 @@ __global__ void KernelPool3DGrad( const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, T* input_grad) { + bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int offsetW = index % input_width + padding_width; @@ -423,18 +471,31 @@ __global__ void KernelPool3DGrad( int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; - int pdstart = (offsetD < ksize_depth) - ? 0 - : (offsetD - ksize_depth) / stride_depth + 1; - int phstart = (offsetH < ksize_height) - ? 0 - : (offsetH - ksize_height) / stride_height + 1; - int pwstart = (offsetW < ksize_width) - ? 0 - : (offsetW - ksize_width) / stride_width + 1; - int pdend = min((offsetD) / stride_depth + 1, output_depth); - int phend = min((offsetH) / stride_height + 1, output_height); - int pwend = min((offsetW) / stride_width + 1, output_width); + int pdstart, pdend; + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + pdstart = offsetD * output_depth / input_depth; + pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + pdstart = (offsetD < ksize_depth) + ? 0 + : (offsetD - ksize_depth) / stride_depth + 1; + phstart = (offsetH < ksize_height) + ? 0 + : (offsetH - ksize_height) / stride_height + 1; + pwstart = (offsetW < ksize_width) + ? 
0 + : (offsetW - ksize_width) / stride_width + 1; + pdend = min((offsetD) / stride_depth + 1, output_depth); + phend = min((offsetH) / stride_height + 1, output_height); + pwend = min((offsetW) / stride_width + 1, output_width); + } T gradient = 0; T input = input_data[index]; @@ -447,18 +508,29 @@ __global__ void KernelPool3DGrad( for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - int pool_size = - exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + int pool_size; + if (adaptive) { + pool_size = + static_cast( + ceil(static_cast(input_depth) / ksize_depth)) * + static_cast( + ceil(static_cast(input_height) / ksize_height)) * + static_cast( + ceil(static_cast(input_width) / ksize_width)); + } else { + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + pool_size = + exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart) + : ksize_depth * ksize_height * ksize_width; + } int output_sub_idx = (pd * output_height + ph) * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], output_grad[output_sub_idx], @@ -533,7 +605,7 @@ class Pool3dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -567,7 +639,7 @@ class Pool3dFunctor { input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, padding_depth, padding_height, padding_width, pool_process, exclusive, - output_data); + adaptive, output_data); } }; @@ -586,7 +658,8 @@ class Pool3dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -622,7 +695,7 @@ class Pool3dGradFunctor { input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, input_grad_data); + padding_width, pool_process, exclusive, adaptive, input_grad_data); } }; @@ -711,7 +784,7 @@ __global__ void KernelMaxPool2dWithIdx( const int input_height, const int input_width, const int output_height, const int 
output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, - const int padding_width, T1* output_data, T2* mask_data) { + const int padding_width, bool adaptive, T1* output_data, T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -719,13 +792,23 @@ __global__ void KernelMaxPool2dWithIdx( int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); - hstart = max(hstart, 0); + int hstart, hend; + int wstart, wend; + if (adaptive) { + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); - wstart = max(wstart, 0); + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + hstart = ph * stride_height - padding_height; + hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + wstart = pw * stride_width - padding_width; + wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + } input_data += (batch_idx * channels + c) * input_height * input_width; T1 ele = -FLT_MAX; @@ -750,36 +833,46 @@ __global__ void KernelMaxPool2DWithIdxGrad( const int channels, const int input_height, const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, T1* input_grad) { + const int padding_height, const int padding_width, bool adaptive, + T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset = index % input_width; - int h_offset = (index / input_width) % input_height; - int c_offset = (index / input_width / input_height) % channels; + int offsetW = index % input_width; + int offsetH = (index / input_width) % input_height; + int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; - int ph_start = - (h_offset + padding_height < ksize_height) - ? 0 - : (h_offset + padding_height - ksize_height) / stride_height + 1; - int pw_start = - (w_offset + padding_width < ksize_width) - ? 0 - : (w_offset + padding_width - ksize_width) / stride_width + 1; - int ph_end = - min((h_offset + padding_height) / stride_height + 1, output_height); - int pw_end = - min((w_offset + padding_width) / stride_width + 1, output_width); + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + phstart = + (offsetH + padding_height < ksize_height) + ? 0 + : (offsetH + padding_height - ksize_height) / stride_height + 1; + pwstart = + (offsetW + padding_width < ksize_width) + ? 
0 + : (offsetW + padding_width - ksize_width) / stride_width + 1; + phend = + min((offsetH + padding_height) / stride_height + 1, output_height); + pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + } T1 gradient = 0; - int input_current_featuremap_idx = h_offset * input_width + w_offset; + int input_current_featuremap_idx = offsetH * input_width + offsetW; int output_idx = - (batch_idx * channels + c_offset) * output_height * output_width; + (batch_idx * channels + offsetC) * output_height * output_width; mask_data += output_idx; output_grad += output_idx; - for (int ph = ph_start; ph < ph_end; ++ph) { - for (int pw = pw_start; pw < pw_end; ++pw) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { if (mask_data[ph * output_width + pw] == input_current_featuremap_idx) gradient += output_grad[ph * output_width + pw]; } @@ -799,8 +892,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -827,7 +920,8 @@ class MaxPool2dWithIndexFunctor { KernelMaxPool2dWithIdx<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, output_data, mask_data); + stride_width, padding_height, padding_width, adaptive, output_data, + mask_data); } }; @@ -843,7 +937,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -870,7 +964,7 @@ class MaxPool2dWithIndexGradFunctor { KernelMaxPool2DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, + stride_height, stride_width, padding_height, padding_width, adaptive, input_grad_data); } }; @@ -892,7 +986,7 @@ __global__ void KernelMaxPool3DWithIdx( const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, - T1* output_data, T2* mask_data) { + bool adaptive, T1* output_data, T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -902,15 +996,29 @@ __global__ void KernelMaxPool3DWithIdx( int batch_idx = index / output_width / output_height / output_depth / channels; - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - 
hstart = max(hstart, 0); - wstart = max(wstart, 0); + int dstart, dend; + int hstart, hend; + int wstart, wend; + if (adaptive) { + dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); + dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + + hstart = ADAPT_START_INDEX(ph, input_height, output_height); + hend = ADAPT_END_INDEX(ph, input_height, output_height); + + wstart = ADAPT_START_INDEX(pw, input_width, output_width); + wend = ADAPT_END_INDEX(pw, input_width, output_width); + } else { + dstart = pd * stride_depth - padding_depth; + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + dend = min(dstart + ksize_depth, input_depth); + hend = min(hstart + ksize_height, input_height); + wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + } T1 ele = -FLT_MAX; int max_index = -1; @@ -940,46 +1048,56 @@ __global__ void KernelMaxPool3DWithIdxGrad( const int output_width, const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, T1* input_grad) { + const int padding_width, bool adaptive, T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int w_offset = index % input_width; - int h_offset = (index / input_width) % input_height; - int d_offset = (index / input_width / input_height) % input_depth; - int c_offset = - (index / input_width / input_height / input_depth) % channels; + int offsetW = index % input_width; + int offsetH = (index / input_width) % input_height; + int offsetD = (index / input_width / input_height) % input_depth; + int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; - int pd_start = - (d_offset + padding_depth < ksize_depth) - ? 0 - : (d_offset + padding_depth - ksize_depth) / stride_depth + 1; - int ph_start = - (h_offset + padding_height < ksize_height) - ? 0 - : (h_offset + padding_height - ksize_height) / stride_height + 1; - int pw_start = - (w_offset + padding_width < ksize_width) - ? 0 - : (w_offset + padding_width - ksize_width) / stride_width + 1; - int pd_end = - min((d_offset + padding_depth) / stride_depth + 1, output_depth); - int ph_end = - min((h_offset + padding_height) / stride_height + 1, output_height); - int pw_end = - min((w_offset + padding_width) / stride_width + 1, output_width); + int pdstart, pdend; + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + pdstart = offsetD * output_depth / input_depth; + pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); + phstart = offsetH * output_height / input_height; + phend = + min((offsetH + 1) * output_height / input_height + 1, output_height); + pwstart = offsetW * output_width / input_width; + pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + } else { + pdstart = + (offsetD + padding_depth < ksize_depth) + ? 0 + : (offsetD + padding_depth - ksize_depth) / stride_depth + 1; + phstart = + (offsetH + padding_height < ksize_height) + ? 0 + : (offsetH + padding_height - ksize_height) / stride_height + 1; + pwstart = + (offsetW + padding_width < ksize_width) + ? 
0 + : (offsetW + padding_width - ksize_width) / stride_width + 1; + pdend = min((offsetD + padding_depth) / stride_depth + 1, output_depth); + phend = + min((offsetH + padding_height) / stride_height + 1, output_height); + pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + } T1 gradient = 0; int input_current_feature_map_idx = - (d_offset * input_height + h_offset) * input_width + w_offset; - int output_idx = (batch_idx * channels + c_offset) * output_depth * + (offsetD * input_height + offsetH) * input_width + offsetW; + int output_idx = (batch_idx * channels + offsetC) * output_depth * output_height * output_width; mask += output_idx; output_grad += output_idx; - for (int pd = pd_start; pd < pd_end; ++pd) { - for (int ph = ph_start; ph < ph_end; ++ph) { - for (int pw = pw_start; pw < pw_end; ++pw) { + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { if (mask[(pd * output_height + ph) * output_width + pw] == input_current_feature_map_idx) gradient += @@ -1002,8 +1120,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1037,7 +1155,8 @@ class MaxPool3dWithIndexFunctor { nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, output_data, mask_data); + padding_depth, padding_height, padding_width, adaptive, output_data, + mask_data); } }; @@ -1053,7 +1172,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -1087,7 +1206,7 @@ class MaxPool3dWithIndexGradFunctor { nthreads, output_grad_data, mask_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, - stride_width, padding_depth, padding_height, padding_width, + stride_width, padding_depth, padding_height, padding_width, adaptive, input_grad_data); } }; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 923babd4c2..d123af8924 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -102,7 +102,7 @@ class Pool2dFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* output); + bool exclusive, bool adaptive, framework::Tensor* output); }; template @@ -114,7 +114,7 @@ class Pool2dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* input_grad); + bool exclusive, bool adaptive, 
framework::Tensor* input_grad);
 };

 template <typename DeviceContext, typename PoolProcess, typename T>
@@ -136,7 +136,7 @@ class Pool3dFunctor {
                   const std::vector<int>& ksize, const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, framework::Tensor* output);
+                  bool exclusive, bool adaptive, framework::Tensor* output);
 };

 template <typename DeviceContext, typename PoolProcess, typename T>
@@ -148,7 +148,7 @@ class Pool3dGradFunctor {
                   const std::vector<int>& ksize, const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
-                  bool exclusive, framework::Tensor* input_grad);
+                  bool exclusive, bool adaptive, framework::Tensor* input_grad);
 };

 template <typename DeviceContext, typename T1, typename T2>
@@ -176,8 +176,8 @@ class MaxPool2dWithIndexFunctor {
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const std::vector<int>& ksize, const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output,
-                  framework::Tensor* mask);
+                  const std::vector<int>& paddings, bool adaptive,
+                  framework::Tensor* output, framework::Tensor* mask);
 };

 template <typename DeviceContext, typename T1, typename T2>
@@ -187,7 +187,7 @@ class MaxPool2dWithIndexGradFunctor {
   void operator()(const DeviceContext& context,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
+                  const std::vector<int>& paddings, bool adaptive,
                   framework::Tensor* input_grad);
 };

@@ -197,8 +197,8 @@ class MaxPool3dWithIndexFunctor {
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const std::vector<int>& ksize, const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output,
-                  framework::Tensor* mask);
+                  const std::vector<int>& paddings, bool adaptive,
+                  framework::Tensor* output, framework::Tensor* mask);
 };

 template <typename DeviceContext, typename T1, typename T2>
@@ -208,7 +208,7 @@ class MaxPool3dWithIndexGradFunctor {
   void operator()(const DeviceContext& context,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
+                  const std::vector<int>& paddings, bool adaptive,
                   framework::Tensor* input_grad);
 };

diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 52b607df74..11b5c49323 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -52,6 +52,7 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
+  bool adaptive = ctx->Attrs().Get<bool>("adaptive");

   PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                  "Pooling input should be 4-D or 5-D tensor.");
@@ -72,9 +73,13 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
                  "Paddings size and pooling size should be the same.");

   std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
-  for (size_t i = 0; i < ksize.size(); ++i) {
-    output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
-                                          paddings[i], strides[i], ceil_mode));
+  if (adaptive) {
+    output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
+  } else {
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(PoolOutputSize(
+          in_x_dims[i + 2], ksize[i], paddings[i], strides[i], ceil_mode));
+    }
   }
   ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
   ctx->ShareLoD("X", "Out");
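With adaptive set, shape inference reduces to N x C x ksize: output bin i
covers the half-open input range [floor(i * N_in / N_out),
ceil((i + 1) * N_in / N_out)). A standalone NumPy sketch of this mapping
(the helper names mirror the adaptive_start_index/adaptive_end_index
functions used by the unit tests later in this series; the check itself is
illustrative, not part of the patch):

.. code-block:: python

    import numpy as np

    def adaptive_start_index(index, input_size, output_size):
        # floor(i * N_in / N_out), as ADAPT_START_INDEX/AdaptStartIndex does
        return int(np.floor(index * input_size / output_size))

    def adaptive_end_index(index, input_size, output_size):
        # ceil((i + 1) * N_in / N_out), as ADAPT_END_INDEX/AdaptEndIndex does
        return int(np.ceil((index + 1) * input_size / output_size))

    N, C, H, W = 2, 3, 10, 10
    ksize = [3, 3]
    out_shape = [N, C] + ksize  # mirrors output_shape.insert(..., ksize)

    # The bins always tile the full input: the first bin starts at 0 and
    # the last bin ends at H, whatever H and ksize[0] are.
    starts = [adaptive_start_index(i, H, ksize[0]) for i in range(ksize[0])]
    ends = [adaptive_end_index(i, H, ksize[0]) for i in range(ksize[0])]
    assert starts[0] == 0 and ends[-1] == H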
The defalut is True.") .SetDefault(true); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); + AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") @@ -325,6 +338,13 @@ void Pool3dOpMaker::Make() { "averaging calculating, otherwise, include the zero-padding. Note, it " "is only used when pooling_type is avg. The defalut is True.") .SetDefault(true); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); AddAttr( "use_cudnn", diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index c0594b7e3c..6c5900bd0f 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -70,6 +70,7 @@ class PoolKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -85,7 +86,7 @@ class PoolKernel : public framework::OpKernel { pool2d_forward; paddle::operators::math::MaxPool pool_process; pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - true, out); + true, false, out); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dFunctor< @@ -93,7 +94,7 @@ class PoolKernel : public framework::OpKernel { pool2d_forward; paddle::operators::math::AvgPool pool_process; pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - exclusive, out); + exclusive, adaptive, out); } } break; case 3: { @@ -103,14 +104,14 @@ class PoolKernel : public framework::OpKernel { pool3d_forward; paddle::operators::math::MaxPool pool_process; pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - true, out); + true, false, out); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dFunctor< DeviceContext, paddle::operators::math::AvgPool, T> pool3d_forward; paddle::operators::math::AvgPool pool_process; pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - exclusive, out); + exclusive, adaptive, out); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } @@ -133,6 +134,7 @@ class PoolGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { @@ -159,7 +161,8 @@ class PoolGradKernel : public framework::OpKernel { pool2d_backward; paddle::operators::math::AvgPoolGrad pool_process; pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, pool_process, exclusive, in_x_grad); + paddings, pool_process, exclusive, adaptive, + in_x_grad); } } break; case 3: { @@ -174,7 +177,8 @@ class PoolGradKernel : public framework::OpKernel { pool3d_backward; 
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
index 873706593e..f9e25277e5 100644
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -40,6 +40,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
     std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
     std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
     std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    bool adaptive = ctx->Attrs().Get<bool>("adaptive");

     PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                    "Pooling input should be 4-D or 5-D tensor.");
@@ -60,9 +61,13 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
                    "Paddings size and pooling size should be the same.");

     std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
-    for (size_t i = 0; i < ksize.size(); ++i) {
-      output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i],
-                                               paddings[i], strides[i]));
+    if (adaptive) {
+      output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
+    } else {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i],
+                                                 paddings[i], strides[i]));
+      }
     }
     ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
     ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
@@ -133,6 +138,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
         "(bool, default:false) Whether to use the global pooling. "
         "If global_pooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
+    AddAttr<bool>(
+        "adaptive",
+        "(bool, default False) When true, adaptive pooling is performed "
+        "instead: the output shape in the H and W dimensions equals ksize, "
+        "the input is divided evenly into ksize grid cells, and pooling is "
+        "performed over each cell to produce one output value.")
+        .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
                               "(vector<int>, default {1, 1}), strides(height, "
                               "width) of pooling operator.")
@@ -209,6 +222,14 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
" "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault(false); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling " + "instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); AddAttr>("strides", "(vector, default {1,1,1}), strides(depth, " "height, width) of pooling operator.") diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index b55fa76eae..a6bec121d4 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -36,6 +36,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + bool adaptive = context.Attr("adaptive"); auto& dev_ctx = context.template device_context(); if (context.Attr("global_pooling")) { @@ -50,13 +51,15 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { paddle::operators::math::MaxPool2dWithIndexFunctor pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, + mask); } break; case 3: { paddle::operators::math::MaxPool3dWithIndexFunctor pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, + mask); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } @@ -75,6 +78,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -93,14 +97,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { T1, T2> pool2d_backward; pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, in_x_grad); + paddings, adaptive, in_x_grad); } break; case 3: { paddle::operators::math::MaxPool3dWithIndexGradFunctor pool3d_backward; pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, in_x_grad); + paddings, adaptive, in_x_grad); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index 35d9737ee0..3c2d51ec91 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -56,13 +56,13 @@ class SppKernel : public framework::OpKernel { math::Pool2dFunctor, T> pool_forward; math::MaxPool max_process; pool_forward(context.template device_context(), *in_x, - kernel_size, strides, paddings, max_process, true, + kernel_size, strides, paddings, max_process, true, false, &out_level); } else if (pooling_type == "avg") { math::Pool2dFunctor, T> pool_forward; math::AvgPool avg_process; pool_forward(context.template device_context(), *in_x, - kernel_size, strides, paddings, avg_process, true, + kernel_size, strides, paddings, avg_process, true, false, &out_level); } // flatten pooling output shape @@ -156,7 +156,7 @@ class SppGradKernel : public framework::OpKernel { math::AvgPoolGrad avg_process; 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index e25eaaa9fd..61794f0d49 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -52,6 +52,8 @@ __all__ = [
     'softmax',
     'pool2d',
     'pool3d',
+    'adaptive_pool2d',
+    'adaptive_pool3d',
     'batch_norm',
     'beam_search_decode',
     'conv2d_transpose',
@@ -2499,6 +2501,190 @@ def pool3d(input,
     return pool_out


+@templatedoc(op_type="pool2d")
+def adaptive_pool2d(input,
+                    pool_size,
+                    pool_type="max",
+                    require_index=False,
+                    use_cudnn=True,
+                    name=None):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): The input tensor of pooling operator. The format of
+                          input tensor is NCHW, where N is batch size, C is
+                          the number of channels, H is the height of the
+                          feature, and W is the width of the feature.
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size
+            is a tuple or list, it must contain two integers,
+            (pool_size_Height, pool_size_Width).
+        pool_type: ${pooling_type_comment}
+        require_index (bool): If true, the index of the max pooling point is
+            returned along with the output. It cannot be set when pool_type
+            is 'avg'.
+        use_cudnn (bool): ${use_cudnn_comment}
+        name (str|None): A name for this layer(optional). If set None, the
+            layer will be named automatically.
+
+    Returns:
+        Variable: The pooling result.
+
+    Raises:
+        ValueError: 'pool_type' is not 'max' nor 'avg'.
+        ValueError: 'use_cudnn' is not a bool value.
+        ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
+        ValueError: 'pool_size' should be a list or tuple with length as 2.
+
+    Examples:
+
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          pool_out = fluid.layers.adaptive_pool2d(
+              input=data,
+              pool_size=[3, 3],
+              pool_type='max',
+              require_index=True)
+    """
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'." %
+            str(pool_type))
+
+    if pool_type == "avg" and require_index:
+        raise ValueError(
+            "invalid setting 'require_index' true when 'pool_type' is 'avg'.")
+
+    def _is_list_or_tuple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2:
+        raise ValueError(
+            "'pool_size' should be a list or tuple with length as 2.")
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False.")
+
+    if pool_type == "max":
+        l_type = 'max_pool2d_with_index'
+    else:
+        l_type = "pool2d"
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+
+    outputs = {"Out": pool_out}
+    if pool_type == "max":
+        mask = helper.create_variable_for_type_inference(dtype)
+        outputs["Mask"] = mask
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": input},
+        outputs=outputs,
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "use_cudnn": use_cudnn,
+            "adaptive": True,
+        })
+
+    return pool_out
+
+
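A minimal usage sketch of the layer defined above (assuming the standard
paddle.fluid import; note that this version returns only pool_out, even
when require_index is set):

.. code-block:: python

    import paddle.fluid as fluid

    data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
    # Average-pool each 32x32 feature map down to a fixed 7x7 grid,
    # independent of the input's spatial size.
    pool_out = fluid.layers.adaptive_pool2d(
        input=data, pool_size=[7, 7], pool_type='avg')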
The format of + input tensor is NCHW, where N is batch size, C is + the number of channels, H is the height of the + feature, and W is the width of the feature. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (Depth, Height, Width). + pool_type: ${pooling_type_comment} + require_index (bool): If true, the index of max pooling point along with outputs. + it cannot be set in average pooling type. + use_cudnn (bool): ${use_cudnn_comment} + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The pooling result. + + Raises: + ValueError: 'pool_type' is not 'max' nor 'avg'. + ValueError: 'use_cudnn' is not a bool value. + ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. + ValueError: 'pool_size' should be a list or tuple with length as 2. + + Examples: + + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.pool2d( + input=data, + pool_size=[3, 3], + pool_type='max', + require_index=True) + """ + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + str(pool_type)) + + if pool_type == "avg" and require_index: + raise ValueError( + "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3: + raise ValueError( + "'pool_size' should be a list or tuple with length as 3.") + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False.") + + if pool_type == "max": + l_type = 'max_pool3d_with_index' + else: + l_type = "pool3d" + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + outputs = {"Out": pool_out} + if pool_type == "max": + mask = helper.create_variable_for_type_inference(dtype) + outputs["Mask"] = mask + + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "use_cudnn": use_cudnn, + "adaptive": True, + }) + + return pool_out + + def batch_norm(input, act=None, is_test=False, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 10e8bb5a86..9785b5063c 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -233,6 +233,28 @@ class TestBook(unittest.TestCase): pool_stride=[1, 2], pool_padding=(2, 1))) + def test_adaptive_pool2d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') + self.assertIsNotNone( + layers.adaptive_pool2d( + x, [3, 3], require_index=True)) + self.assertIsNotNone( + layers.adaptive_pool2d( + x, [3, 3], pool_type='avg')) + + def test_adaptive_pool3d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 244, 224, 224], dtype='float32') + self.assertIsNotNone( + layers.adaptive_pool3d( + x, [3, 3, 3], require_index=True)) + self.assertIsNotNone( + layers.adaptive_pool3d( + x, [3, 3, 3], pool_type='avg')) + def test_lstm_unit(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py 
b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 47b2e71a4e..5ccdf082e8 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +from __future__ import division import unittest import numpy as np @@ -21,29 +22,47 @@ import paddle.fluid.core as core from op_test import OpTest +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, H, W = x.shape if global_pool == 1: ksize = [H, W] - H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) @@ -56,27 +75,37 @@ def avg_pool2D_forward_naive(x, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, H, W = x.shape if global_pool == 1: ksize = [H, W] - H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * 
strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] - field_size = ((r_end - r_start) * (c_end - c_start)) if exclusive \ - else (ksize[0] * ksize[1]) + field_size = ((r_end - r_start) * (c_end - c_start)) \ + if (exclusive or adaptive) else (ksize[0] * ksize[1]) out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size return out @@ -93,12 +122,13 @@ class TestPool2D_Op(OpTest): self.init_pool_type() self.init_ceil_mode() self.init_exclusive() + self.init_adaptive() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype(self.dtype) output = self.pool2D_forward_naive( input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode, self.exclusive).astype(self.dtype) + self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { @@ -112,7 +142,8 @@ class TestPool2D_Op(OpTest): 'ceil_mode': self.ceil_mode, 'data_format': 'AnyLayout', # TODO(dzhwinter) : should be fix latter - 'exclusive': self.exclusive + 'exclusive': self.exclusive, + 'adaptive': self.adaptive } self.outputs = {'Out': output} @@ -159,6 +190,9 @@ class TestPool2D_Op(OpTest): def init_exclusive(self): self.exclusive = True + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestPool2D_Op): def init_test_case(self): @@ -315,5 +349,10 @@ class TestCUDNNAvgInclude(TestCase2): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index f05f8ccb39..47a5b2d1ab 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -13,6 +13,7 @@ # limitations under the License. 
from __future__ import print_function +from __future__ import division import unittest import numpy as np @@ -21,35 +22,59 @@ import paddle.fluid.core as core from op_test import OpTest +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, D, H, W = x.shape if global_pool == 1: ksize = [D, H, W] - D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) // strides[2] + 1 if ceil_mode else ( - W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 + ) // strides[2] + 1 if ceil_mode else ( + W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) @@ -62,33 +87,49 @@ def avg_pool3D_forward_naive(x, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, D, H, W = x.shape if global_pool == 1: ksize = [D, H, W] - D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) // strides[2] + 1 if ceil_mode else ( - W - ksize[2] + 2 * paddings[2]) // 
strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 + ) // strides[2] + 1 if ceil_mode else ( + W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ - if exclusive else ksize[0] * ksize[1] * ksize[2] + if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2] out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size return out @@ -105,13 +146,14 @@ class TestPool3d_Op(OpTest): self.init_pool_type() self.init_ceil_mode() self.init_exclusive() + self.init_adaptive() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype(self.dtype) output = self.pool3D_forward_naive( input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode, self.exclusive).astype(self.dtype) + self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { @@ -124,7 +166,8 @@ class TestPool3d_Op(OpTest): 'ceil_mode': self.ceil_mode, 'data_format': 'AnyLayout', # TODO(dzhwinter) : should be fix latter - 'exclusive': self.exclusive + 'exclusive': self.exclusive, + 'adaptive': self.adaptive } self.outputs = {'Out': output} @@ -171,6 +214,9 @@ class TestPool3d_Op(OpTest): def init_exclusive(self): self.exclusive = True + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestPool3d_Op): def init_test_case(self): @@ -353,5 +399,10 @@ class TestCUDNNAvgInclude(TestCUDNNCase3): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py index 488ff431d4..6575c408ee 100644 --- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_pool_max_op.py @@ -13,33 +13,62 @@ # limitations under the License. from __future__ import print_function +from __future__ import division import unittest import numpy as np from op_test import OpTest -def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False): +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def max_pool3D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=False, + adaptive=False): N, C, D, H, W = x.shape if global_pool: ksize = [D, H, W] paddings = [0, 0, 0] - D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) mask = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) @@ -58,23 +87,37 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False): return out, mask -def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False): +def max_pool2D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=False, + adaptive=False): N, C, H, W = x.shape if global_pool: ksize = [H, W] paddings = [0, 0] - H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) mask = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = 
adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) @@ -95,10 +138,12 @@ class TestMaxPoolWithIndex_Op(OpTest): def setUp(self): self.init_test_case() self.init_global() + self.init_adaptive() input = np.random.random(self.shape).astype("float32") output, mask = self.pool_forward_naive(input, self.ksize, self.strides, - self.paddings, self.global_pool) + self.paddings, self.global_pool, + self.adaptive) output = output.astype("float32") mask = mask.astype("int32") @@ -107,6 +152,7 @@ class TestMaxPoolWithIndex_Op(OpTest): 'paddings': self.paddings, 'ksize': self.ksize, 'global_pooling': self.global_pool, + 'adaptive': self.adaptive, } self.inputs = {'X': input} @@ -129,6 +175,9 @@ class TestMaxPoolWithIndex_Op(OpTest): def init_global(self): self.global_pool = False + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestMaxPoolWithIndex_Op): def init_global(self): @@ -190,5 +239,15 @@ class TestCase7(TestCase6): self.global_pool = False +class TestCastAdaptive2d(TestCase6): + def init_adaptive(self): + self.adaptive = True + + +class TestCastAdaptive3d(TestMaxPoolWithIndex_Op): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() From cf06e50f1d2b4ceca197e41c2c17a71783c5bc04 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 3 Dec 2018 20:04:49 +0800 Subject: [PATCH 14/45] add doc for adaptive pool. 
test=develop --- paddle/fluid/operators/pool_op.cc | 39 ++++++++++++++++++++ paddle/fluid/operators/pool_with_index_op.cc | 11 ++++++ 2 files changed, 50 insertions(+) diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 11b5c49323..a2f5f811ab 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -277,6 +277,14 @@ Example: Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} $$ + For adaptive = true: + $$ + hstart = floor(i * H_{in} / H_{out}) + hend = ceil((i + 1) * H_{in} / H_{out}) + wstart = floor(j * W_{in} / W_{out}) + wend = ceil((j + 1) * W_{in} / W_{out}) + Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + $$ )DOC"); } @@ -396,6 +404,37 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 $$ + For exclusive = true: + $$ + dstart = i * strides[0] - paddings[0] + dend = dstart + ksize[0] + hstart = j * strides[1] - paddings[1] + hend = hstart + ksize[1] + wstart = k * strides[2] - paddings[2] + wend = wstart + ksize[2] + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} + $$ + For exclusive = false: + $$ + dstart = max(0, i * strides[0] - paddings[0]) + dend = min(D, dstart + ksize[0]) + hstart = max(0, j * strides[1] - paddings[1]) + hend = min(H, hstart + ksize[1]) + wstart = max(0, k * strides[2] - paddings[2]) + wend = min(W, wstart + ksize[2]) + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + $$ + + For adaptive = true: + $$ + dstart = floor(i * D_{in} / D_{out}) + dend = ceil((i + 1) * D_{in} / D_{out}) + hstart = floor(j * H_{in} / H_{out}) + hend = ceil((j + 1) * H_{in} / H_{out}) + wstart = floor(k * W_{in} / W_{out}) + wend = ceil((k + 1) * W_{in} / W_{out}) + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + $$ )DOC"); } diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index f9e25277e5..5354b485bd 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -182,6 +182,12 @@ Example: H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ + + For adaptive = true: + $$ + H_{out} = ksize[0] W_{out} = ksize[1] + $$ + )DOC"); } @@ -267,6 +273,11 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$ + + For adaptive = true: + $$ + D_{out} = ksize[0] H_{out} = ksize[1] W_{out} = ksize[2] + $$ )DOC"); } From a81fabd3273ae0cba9988da612dd5241aeec823f Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 11 Dec 2018 13:59:56 +0800 Subject: [PATCH 15/45] fix doc errors. 
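A quick sanity check of the floor/ceil window formulas documented in the patch above, assuming nothing beyond the formulas themselves: with an input extent of 5 and an output extent of 3, the three windows are [0, 2), [1, 4), and [3, 5). Adjacent windows may overlap, no window is ever empty, and their union always covers the whole input axis:

import math

def adaptive_window(i, in_size, out_size):
    start = int(math.floor(i * in_size / out_size))     # hstart / wstart / dstart
    end = int(math.ceil((i + 1) * in_size / out_size))  # hend / wend / dend
    return start, end

print([adaptive_window(i, 5, 3) for i in range(3)])
# [(0, 2), (1, 4), (3, 5)]

This possible overlap is also why the averaging denominator in the adaptive case is (hend - hstart) * (wend - wstart) rather than a fixed kernel size.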
test=develop --- paddle/fluid/operators/math/pooling.cc | 70 +++---- paddle/fluid/operators/math/pooling.cu | 182 +++++++++--------- paddle/fluid/operators/math/pooling.h | 12 ++ python/paddle/fluid/layers/nn.py | 26 ++- .../fluid/tests/unittests/test_layers.py | 13 +- 5 files changed, 154 insertions(+), 149 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index b4ee82add3..30873e9f87 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -19,16 +19,6 @@ namespace paddle { namespace operators { namespace math { -static inline int ADAPT_START_INDEX(int ph, int input_size, int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -static inline int ADAPT_END_INDEX(int ph, int input_size, int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - /* * All tensors are in NCHW format. * Ksize, strides, paddings are two elements. These two elements represent @@ -67,8 +57,8 @@ class Pool2dFunctor { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -76,8 +66,8 @@ class Pool2dFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -144,8 +134,8 @@ class Pool2dGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -153,8 +143,8 @@ class Pool2dGradFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -319,8 +309,8 @@ class Pool3dFunctor { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); } else { dstart = pd * stride_depth - padding_depth; dend = std::min(dstart + ksize_depth, input_depth); @@ -328,8 +318,8 @@ class Pool3dFunctor { } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, 
input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -337,8 +327,8 @@ class Pool3dFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -417,8 +407,8 @@ class Pool3dGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); } else { dstart = pd * stride_depth - padding_depth; dend = std::min(dstart + ksize_depth, input_depth); @@ -426,8 +416,8 @@ class Pool3dGradFunctor { } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -435,8 +425,8 @@ class Pool3dGradFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -615,8 +605,8 @@ class MaxPool2dWithIndexFunctor { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -624,8 +614,8 @@ class MaxPool2dWithIndexFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); @@ -753,8 +743,8 @@ class MaxPool3dWithIndexFunctor { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); } else { dstart = pd * stride_depth - padding_depth; 
dend = std::min(dstart + ksize_depth, input_depth); @@ -762,8 +752,8 @@ class MaxPool3dWithIndexFunctor { } for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); } else { hstart = ph * stride_height - padding_height; hend = std::min(hstart + ksize_height, input_height); @@ -771,8 +761,8 @@ class MaxPool3dWithIndexFunctor { } for (int pw = 0; pw < output_width; ++pw) { if (adaptive) { - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { wstart = pw * stride_width - padding_width; wend = std::min(wstart + ksize_width, input_width); diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 5f3b82ed55..efce3f899a 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -21,18 +21,6 @@ namespace paddle { namespace operators { namespace math { -__device__ __forceinline__ int ADAPT_START_INDEX(int ph, int input_size, - int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -__device__ __forceinline__ int ADAPT_END_INDEX(int ph, int input_size, - int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - template __global__ void KernelPool2D(const int nthreads, const T* input_data, const int channels, const int input_height, @@ -52,11 +40,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, int hstart, hend; int wstart, wend; if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { hstart = ph * stride_height - padding_height; hend = min(hstart + ksize_height, input_height); @@ -91,28 +79,29 @@ __global__ void KernelPool2DGrad( PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width + padding_width; - int offsetH = (index / input_width) % input_height + padding_height; + int w_offset = index % input_width + padding_width; + int h_offset = (index / input_width) % input_height + padding_height; int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; int phstart, phend; int pwstart, pwend; if (adaptive) { - phstart = offsetH * output_height / input_height; + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / 
input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { - phstart = (offsetH < ksize_height) + phstart = (h_offset < ksize_height) ? 0 - : (offsetH - ksize_height) / stride_height + 1; - pwstart = (offsetW < ksize_width) + : (h_offset - ksize_height) / stride_height + 1; + pwstart = (w_offset < ksize_width) ? 0 - : (offsetW - ksize_width) / stride_width + 1; - phend = min(offsetH / stride_height + 1, output_height); - pwend = min(offsetW / stride_width + 1, output_width); + : (w_offset - ksize_width) / stride_width + 1; + phend = min(h_offset / stride_height + 1, output_height); + pwend = min(w_offset / stride_width + 1, output_width); } T gradient = 0; T input = input_data[index]; @@ -414,14 +403,14 @@ __global__ void KernelPool3D( int hstart, hend; int wstart, wend; if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { dstart = pd * stride_depth - padding_depth; hstart = ph * stride_height - padding_height; @@ -464,9 +453,9 @@ __global__ void KernelPool3DGrad( bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width + padding_width; - int offsetH = (index / input_width) % input_height + padding_height; - int offsetD = + int w_offset = index % input_width + padding_width; + int h_offset = (index / input_width) % input_height + padding_height; + int d_offset = (index / input_width / input_height) % input_depth + padding_depth; int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; @@ -475,26 +464,28 @@ __global__ void KernelPool3DGrad( int phstart, phend; int pwstart, pwend; if (adaptive) { - pdstart = offsetD * output_depth / input_depth; - pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); - phstart = offsetH * output_height / input_height; + pdstart = d_offset * output_depth / input_depth; + pdend = + min((d_offset + 1) * output_depth / input_depth + 1, output_depth); + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { - pdstart = (offsetD < ksize_depth) + pdstart = (d_offset < ksize_depth) ? 0 - : (offsetD - ksize_depth) / stride_depth + 1; - phstart = (offsetH < ksize_height) + : (d_offset - ksize_depth) / stride_depth + 1; + phstart = (h_offset < ksize_height) ? 
0 - : (offsetH - ksize_height) / stride_height + 1; - pwstart = (offsetW < ksize_width) + : (h_offset - ksize_height) / stride_height + 1; + pwstart = (w_offset < ksize_width) ? 0 - : (offsetW - ksize_width) / stride_width + 1; - pdend = min((offsetD) / stride_depth + 1, output_depth); - phend = min((offsetH) / stride_height + 1, output_height); - pwend = min((offsetW) / stride_width + 1, output_width); + : (w_offset - ksize_width) / stride_width + 1; + pdend = min((d_offset) / stride_depth + 1, output_depth); + phend = min((h_offset) / stride_height + 1, output_height); + pwend = min((w_offset) / stride_width + 1, output_width); } T gradient = 0; @@ -795,11 +786,11 @@ __global__ void KernelMaxPool2dWithIdx( int hstart, hend; int wstart, wend; if (adaptive) { - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { hstart = ph * stride_height - padding_height; hend = min(hstart + ksize_height, input_height); @@ -837,35 +828,36 @@ __global__ void KernelMaxPool2DWithIdxGrad( T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width; - int offsetH = (index / input_width) % input_height; + int w_offset = index % input_width; + int h_offset = (index / input_width) % input_height; int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; int phstart, phend; int pwstart, pwend; if (adaptive) { - phstart = offsetH * output_height / input_height; + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { phstart = - (offsetH + padding_height < ksize_height) + (h_offset + padding_height < ksize_height) ? 0 - : (offsetH + padding_height - ksize_height) / stride_height + 1; + : (h_offset + padding_height - ksize_height) / stride_height + 1; pwstart = - (offsetW + padding_width < ksize_width) + (w_offset + padding_width < ksize_width) ? 
0 - : (offsetW + padding_width - ksize_width) / stride_width + 1; + : (w_offset + padding_width - ksize_width) / stride_width + 1; phend = - min((offsetH + padding_height) / stride_height + 1, output_height); - pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + min((h_offset + padding_height) / stride_height + 1, output_height); + pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } T1 gradient = 0; - int input_current_featuremap_idx = offsetH * input_width + offsetW; + int input_current_featuremap_idx = h_offset * input_width + w_offset; int output_idx = (batch_idx * channels + offsetC) * output_height * output_width; @@ -1000,14 +992,14 @@ __global__ void KernelMaxPool3DWithIdx( int hstart, hend; int wstart, wend; if (adaptive) { - dstart = ADAPT_START_INDEX(pd, input_depth, output_depth); - dend = ADAPT_END_INDEX(pd, input_depth, output_depth); + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); - hstart = ADAPT_START_INDEX(ph, input_height, output_height); - hend = ADAPT_END_INDEX(ph, input_height, output_height); + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); - wstart = ADAPT_START_INDEX(pw, input_width, output_width); - wend = ADAPT_END_INDEX(pw, input_width, output_width); + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); } else { dstart = pd * stride_depth - padding_depth; hstart = ph * stride_height - padding_height; @@ -1051,9 +1043,9 @@ __global__ void KernelMaxPool3DWithIdxGrad( const int padding_width, bool adaptive, T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width; - int offsetH = (index / input_width) % input_height; - int offsetD = (index / input_width / input_height) % input_depth; + int w_offset = index % input_width; + int h_offset = (index / input_width) % input_height; + int d_offset = (index / input_width / input_height) % input_depth; int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; @@ -1061,35 +1053,37 @@ __global__ void KernelMaxPool3DWithIdxGrad( int phstart, phend; int pwstart, pwend; if (adaptive) { - pdstart = offsetD * output_depth / input_depth; - pdend = min((offsetD + 1) * output_depth / input_depth + 1, output_depth); - phstart = offsetH * output_height / input_height; + pdstart = d_offset * output_depth / input_depth; + pdend = + min((d_offset + 1) * output_depth / input_depth + 1, output_depth); + phstart = h_offset * output_height / input_height; phend = - min((offsetH + 1) * output_height / input_height + 1, output_height); - pwstart = offsetW * output_width / input_width; - pwend = min((offsetW + 1) * output_width / input_width + 1, output_width); + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); } else { pdstart = - (offsetD + padding_depth < ksize_depth) + (d_offset + padding_depth < ksize_depth) ? 0 - : (offsetD + padding_depth - ksize_depth) / stride_depth + 1; + : (d_offset + padding_depth - ksize_depth) / stride_depth + 1; phstart = - (offsetH + padding_height < ksize_height) + (h_offset + padding_height < ksize_height) ? 
0 - : (offsetH + padding_height - ksize_height) / stride_height + 1; + : (h_offset + padding_height - ksize_height) / stride_height + 1; pwstart = - (offsetW + padding_width < ksize_width) + (w_offset + padding_width < ksize_width) ? 0 - : (offsetW + padding_width - ksize_width) / stride_width + 1; - pdend = min((offsetD + padding_depth) / stride_depth + 1, output_depth); + : (w_offset + padding_width - ksize_width) / stride_width + 1; + pdend = min((d_offset + padding_depth) / stride_depth + 1, output_depth); phend = - min((offsetH + padding_height) / stride_height + 1, output_height); - pwend = min((offsetW + padding_width) / stride_width + 1, output_width); + min((h_offset + padding_height) / stride_height + 1, output_height); + pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } T1 gradient = 0; int input_current_feature_map_idx = - (offsetD * input_height + offsetH) * input_width + offsetW; + (d_offset * input_height + h_offset) * input_width + w_offset; int output_idx = (batch_idx * channels + offsetC) * output_depth * output_height * output_width; mask += output_idx; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index d123af8924..e1f8e6df1d 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -68,6 +68,18 @@ class AvgPoolGrad { } }; +/* used for adaptive pool to calculate start and end index of each divided grid + */ +HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + /* * \brief Getting pooling results, and calculating gradient. * diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 61794f0d49..07fc4ccc6b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2506,7 +2506,7 @@ def adaptive_pool2d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=True, + use_cudnn=False, name=None): """ ${comment} @@ -2521,7 +2521,7 @@ def adaptive_pool2d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool): ${use_cudnn_comment} + use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2531,6 +2531,7 @@ def adaptive_pool2d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. ValueError: 'use_cudnn' is not a bool value. + ValueError: adaptive pool currently not supported in cudnn. ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. ValueError: 'pool_size' should be a list or tuple with length as 2. 
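The layer-level contract these hunks converge on can be summarized in a short usage sketch (illustrative only; it mirrors the calls exercised in test_layers.py further down, including the (pool_out, mask) tuple that require_index=True now returns):

import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')

# Average adaptive pooling returns a single output Variable.
avg_out = fluid.layers.adaptive_pool2d(
    input=data, pool_size=[3, 3], pool_type='avg')

# Max adaptive pooling with require_index=True returns the output
# together with the max-index mask.
pool_out, mask = fluid.layers.adaptive_pool2d(
    input=data, pool_size=[3, 3], pool_type='max', require_index=True)

# use_cudnn now defaults to False; passing use_cudnn=True raises
# ValueError("adaptive pool currently not supported in cudnn.").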
@@ -2540,11 +2541,11 @@ def adaptive_pool2d(input, data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') - conv2d = fluid.layers.pool2d( + pool_out = fluid.layers.adaptive_pool2d( input=data, pool_size=[3, 3], pool_type='max', - require_index=True) + require_index=False) """ if pool_type not in ["max", "avg"]: raise ValueError( @@ -2565,6 +2566,9 @@ def adaptive_pool2d(input, if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False.") + if use_cudnn: + raise ValueError("adaptive pool currently not supported in cudnn.") + if pool_type == "max": l_type = 'max_pool2d_with_index' else: @@ -2590,7 +2594,7 @@ def adaptive_pool2d(input, "adaptive": True, }) - return pool_out + return (pool_out, mask) if require_index else pool_out @templatedoc(op_type="pool3d") @@ -2598,7 +2602,7 @@ def adaptive_pool3d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=True, + use_cudnn=False, name=None): """ ${comment} @@ -2613,7 +2617,7 @@ def adaptive_pool3d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool): ${use_cudnn_comment} + use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2623,6 +2627,7 @@ def adaptive_pool3d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. ValueError: 'use_cudnn' is not a bool value. + ValueError: adaptive pool currently not supported in cudnn. ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. ValueError: 'pool_size' should be a list or tuple with length as 2. @@ -2632,7 +2637,7 @@ def adaptive_pool3d(input, data = fluid.layers.data( name='data', shape=[3, 32, 32], dtype='float32') - conv2d = fluid.layers.pool2d( + pool_out, mask = fluid.layers.adaptive_pool3d( input=data, pool_size=[3, 3], pool_type='max', @@ -2657,6 +2662,9 @@ def adaptive_pool3d(input, if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False.") + if use_cudnn: + raise ValueError("adaptive pool currently not supported in cudnn.") + if pool_type == "max": l_type = 'max_pool3d_with_index' else: @@ -2682,7 +2690,7 @@ def adaptive_pool3d(input, "adaptive": True, }) - return pool_out + return (pool_out, mask) if require_index else pool_out def batch_norm(input, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 9785b5063c..030bf012fa 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -237,23 +237,24 @@ class TestBook(unittest.TestCase): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') - self.assertIsNotNone( - layers.adaptive_pool2d( - x, [3, 3], require_index=True)) self.assertIsNotNone( layers.adaptive_pool2d( x, [3, 3], pool_type='avg')) + pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_adaptive_pool3d(self): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 244, 224, 224], dtype='float32') - self.assertIsNotNone( - layers.adaptive_pool3d( - x, [3, 3, 3], require_index=True)) self.assertIsNotNone( layers.adaptive_pool3d( x, [3, 3, 3], pool_type='avg')) + pool, mask = layers.adaptive_pool3d( 
+ x, [3, 3, 3], require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_lstm_unit(self): program = Program() From 7ec3264b513270fa7a70c2b5fec2166630568a2c Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 12 Dec 2018 10:56:00 +0800 Subject: [PATCH 16/45] fix API spec. test=develop --- paddle/fluid/API.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 87ed586aad..845abe7d5b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -77,8 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) -paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, True, None)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) From 9bd70a1e0433b5a930c43b1d7d2af67bc72d38a6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 11 Dec 2018 16:32:42 +0800 Subject: [PATCH 17/45] Change tensor uses proto::VarType::type test=develop --- .../fluid/framework/data_layout_transform.cc | 6 +- .../fluid/framework/data_layout_transform.h | 16 ++-- paddle/fluid/framework/data_type.cc | 24 ++---- paddle/fluid/framework/data_type.h | 77 +++++++++++-------- paddle/fluid/framework/data_type_test.cc | 6 +- .../framework/details/all_reduce_op_handle.cc | 2 +- .../framework/details/fuse_vars_op_handle.h | 4 +- .../framework/details/reduce_op_handle.cc | 4 +- 
paddle/fluid/framework/dlpack_tensor.cc | 37 ++++----- paddle/fluid/framework/dlpack_tensor_test.cc | 20 +---- .../fluid/framework/executor_thread_worker.cc | 46 ++++------- paddle/fluid/framework/lod_tensor.cc | 6 +- paddle/fluid/framework/operator.cc | 14 ++-- paddle/fluid/framework/selected_rows.cc | 4 +- paddle/fluid/framework/tensor.cc | 4 +- paddle/fluid/framework/tensor.h | 9 ++- paddle/fluid/framework/tensor_impl.h | 12 ++- paddle/fluid/framework/tensor_util.cc | 10 +-- .../fluid/inference/api/analysis_predictor.cc | 4 +- paddle/fluid/inference/api/api_impl.cc | 4 +- paddle/fluid/inference/api/api_impl_tester.cc | 4 +- paddle/fluid/operators/affine_grid_op.cc | 8 +- paddle/fluid/operators/arg_max_op.cc | 1 - paddle/fluid/operators/arg_max_op.cu | 2 - paddle/fluid/operators/arg_min_op.cc | 1 - paddle/fluid/operators/arg_min_op.cu | 2 - .../fluid/operators/array_to_lod_tensor_op.cc | 4 +- paddle/fluid/operators/attention_lstm_op.cc | 5 +- .../fluid/operators/average_accumulates_op.cc | 5 +- paddle/fluid/operators/batch_norm_op.cc | 20 ++--- .../fluid/operators/beam_search_decode_op.cc | 2 +- paddle/fluid/operators/beam_search_op.cc | 3 +- paddle/fluid/operators/bpr_loss_op.cc | 10 +-- .../controlflow/conditional_block_op.cc | 13 ++-- .../fluid/operators/controlflow/while_op.cc | 2 +- paddle/fluid/operators/conv_op.cc | 12 ++- paddle/fluid/operators/conv_transpose_op.cc | 10 +-- paddle/fluid/operators/crf_decoding_op.cc | 5 +- paddle/fluid/operators/crop_op.cc | 9 +-- paddle/fluid/operators/cross_entropy_op.cc | 10 +-- paddle/fluid/operators/ctc_align_op.cc | 5 +- .../detection/anchor_generator_op.cc | 3 +- .../operators/detection/bipartite_match_op.cc | 5 +- .../detection/density_prior_box_op.cc | 3 +- .../detection/generate_proposals_op.cc | 5 +- .../detection/mine_hard_examples_op.cc | 3 +- .../operators/detection/multiclass_nms_op.cc | 3 +- .../fluid/operators/detection/prior_box_op.cc | 3 +- .../detection/roi_perspective_transform_op.cc | 10 +-- .../detection/rpn_target_assign_op.cc | 3 +- .../operators/detection/target_assign_op.cc | 5 +- paddle/fluid/operators/detection_map_op.cc | 3 +- .../operators/elementwise/elementwise_op.h | 4 +- paddle/fluid/operators/fake_quantize_op.cc | 10 +-- paddle/fluid/operators/fc_op.cc | 10 +-- paddle/fluid/operators/fill_constant_op.cc | 4 +- paddle/fluid/operators/fill_op.cc | 4 +- .../fused/fused_elemwise_activation_op.cc | 10 +-- .../fused/fused_embedding_fc_lstm_op.cc | 3 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 5 +- .../fluid/operators/fused/fusion_lstm_op.cc | 5 +- .../fused/fusion_seqconv_eltadd_relu_op.cc | 5 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 5 +- paddle/fluid/operators/gather_op.cc | 10 +-- paddle/fluid/operators/grid_sampler_op.cc | 12 +-- paddle/fluid/operators/group_norm_op.cc | 3 +- .../operators/hierarchical_sigmoid_op.cc | 10 +-- paddle/fluid/operators/interpolate_op.cc | 8 +- paddle/fluid/operators/is_empty_op.cc | 3 +- paddle/fluid/operators/isfinite_op.cc | 5 +- paddle/fluid/operators/layer_norm_op.cc | 3 +- paddle/fluid/operators/linear_chain_crf_op.cc | 9 +-- paddle/fluid/operators/load_combine_op.cc | 2 +- paddle/fluid/operators/load_op.cc | 2 +- paddle/fluid/operators/lod_reset_op.cc | 10 +-- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- .../fluid/operators/lookup_sparse_table_op.cc | 3 +- paddle/fluid/operators/lrn_op.cc | 5 +- paddle/fluid/operators/lstm_op.cc | 6 +- paddle/fluid/operators/lstmp_op.cc | 6 +- paddle/fluid/operators/math/math_function.cc | 6 +- 
paddle/fluid/operators/math/math_function.cu | 2 +- paddle/fluid/operators/mean_iou_op.cc | 5 +- paddle/fluid/operators/mean_op.cc | 4 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 4 +- paddle/fluid/operators/metrics/accuracy_op.cc | 5 +- paddle/fluid/operators/metrics/auc_op.cc | 5 +- .../operators/metrics/precision_recall_op.cc | 5 +- paddle/fluid/operators/multiplex_op.cc | 10 +-- paddle/fluid/operators/nce_op.cc | 10 +-- .../fluid/operators/optimizers/adadelta_op.cc | 5 +- .../fluid/operators/optimizers/adagrad_op.cc | 5 +- paddle/fluid/operators/optimizers/adam_op.cc | 3 +- .../fluid/operators/optimizers/adamax_op.cc | 5 +- .../optimizers/decayed_adagrad_op.cc | 5 +- paddle/fluid/operators/optimizers/ftrl_op.cc | 3 +- .../optimizers/proximal_adagrad_op.cc | 5 +- .../operators/optimizers/proximal_gd_op.cc | 5 +- paddle/fluid/operators/pad2d_op.cc | 8 +- .../fluid/operators/pad_constant_like_op.cc | 10 +-- paddle/fluid/operators/pool_op.cc | 7 +- paddle/fluid/operators/pool_with_index_op.cc | 10 +-- .../operators/positive_negative_pair_op.cc | 5 +- paddle/fluid/operators/prelu_op.cc | 10 +-- paddle/fluid/operators/print_op.cc | 2 +- paddle/fluid/operators/random_crop_op.cc | 5 +- .../reader/create_batch_reader_op.cc | 4 +- paddle/fluid/operators/recurrent_op.cc | 2 +- paddle/fluid/operators/reshape_op.cc | 14 ++-- .../fluid/operators/rnn_memory_helper_op.cc | 2 +- paddle/fluid/operators/roi_align_op.cc | 10 +-- paddle/fluid/operators/roi_pool_op.cc | 10 +-- paddle/fluid/operators/save_combine_op.cc | 2 +- paddle/fluid/operators/save_op.cc | 2 +- paddle/fluid/operators/scatter_op.cc | 10 +-- .../sequence_ops/sequence_pool_op.cc | 5 +- .../sequence_ops/sequence_scatter_op.cc | 10 +-- .../sequence_ops/sequence_slice_op.cc | 10 +-- .../sequence_ops/sequence_softmax_op.cc | 4 +- paddle/fluid/operators/similarity_focus_op.cc | 5 +- paddle/fluid/operators/slice_op.cc | 5 +- paddle/fluid/operators/softmax_op.cc | 7 +- .../softmax_with_cross_entropy_op.cc | 8 +- paddle/fluid/operators/sum_op.cc | 13 ++-- .../operators/tensorrt/tensorrt_engine_op.h | 5 +- paddle/fluid/operators/transpose_op.cc | 9 +-- paddle/fluid/operators/unpool_op.cc | 10 +-- paddle/fluid/operators/warpctc_op.cc | 10 +-- paddle/fluid/operators/yolov3_loss_op.cc | 10 +-- paddle/fluid/platform/nccl_helper.h | 11 +-- paddle/fluid/pybind/pybind.cc | 2 +- paddle/fluid/pybind/tensor_py.h | 2 +- 132 files changed, 407 insertions(+), 576 deletions(-) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 5467f6d1b2..72c50518af 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -85,7 +85,7 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var, out->mutable_data(expected_kernel_type.place_, in.type()); framework::VisitDataType( - framework::ToDataType(in.type()), + in.type(), CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out)); out->set_layout(expected_kernel_type.data_layout_); @@ -101,7 +101,7 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { case mkldnn::memory::data_type::f32: return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s8: - return platform::to_void_cast(tensor.data()); + return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::u8: return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s16: @@ -144,7 +144,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& 
kernel_type_for_var, memory::data_type in_type = ToMKLDNNDataType(in.type()); PADDLE_ENFORCE(in_type != memory::data_type::data_undef, - "Input tensor type is not supported: ", in.type().name()); + "Input tensor type is not supported: %s", in.type()); memory::data_type out_type = in_type; auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 90bb206ec6..2479de4fd4 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -50,14 +50,14 @@ inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { } } -inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { - static const std::map dict{ - {std::type_index(typeid(float)), MKLDNNDataType::f32}, // NOLINT - {std::type_index(typeid(char)), MKLDNNDataType::s8}, // NOLINT - {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8}, - {std::type_index(typeid(int16_t)), MKLDNNDataType::s16}, - {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}}; - auto iter = dict.find(type); +inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { + static std::unordered_map dict{ + {DataTypeTrait::DataType, MKLDNNDataType::f32}, + {DataTypeTrait::DataType, MKLDNNDataType::s8}, + {DataTypeTrait::DataType, MKLDNNDataType::u8}, + {DataTypeTrait::DataType, MKLDNNDataType::s16}, + {DataTypeTrait::DataType, MKLDNNDataType::s32}}; + auto iter = dict.find(static_cast(type)); if (iter != dict.end()) return iter->second; return MKLDNNDataType::data_undef; } diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 28f3da88fa..a0248cf3c7 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -26,7 +26,7 @@ struct DataTypeMap { std::unordered_map cpp_to_proto_; std::unordered_map proto_to_cpp_; std::unordered_map proto_to_str_; - std::unordered_map cpp_to_size_; + std::unordered_map proto_to_size_; }; static DataTypeMap* InitDataTypeMap(); @@ -45,7 +45,7 @@ static inline void RegisterType(DataTypeMap* map, map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); map->cpp_to_proto_.emplace(typeid(T), proto_type); map->proto_to_str_.emplace(static_cast(proto_type), name); - map->cpp_to_size_.emplace(typeid(T), sizeof(T)); + map->proto_to_size_.emplace(static_cast(proto_type), sizeof(T)); } static DataTypeMap* InitDataTypeMap() { @@ -54,17 +54,7 @@ static DataTypeMap* InitDataTypeMap() { #define RegType(cc_type, proto_type) \ RegisterType(retv, proto_type, #cc_type) - // NOTE: Add your customize type here. 
- RegType(float16, proto::VarType::FP16); - RegType(float, proto::VarType::FP32); - RegType(double, proto::VarType::FP64); - RegType(int, proto::VarType::INT32); - RegType(int64_t, proto::VarType::INT64); - RegType(bool, proto::VarType::BOOL); - RegType(size_t, proto::VarType::SIZE_T); - RegType(int16_t, proto::VarType::INT16); - RegType(uint8_t, proto::VarType::UINT8); - RegType(int8_t, proto::VarType::INT8); + _ForEachDataType_(RegType); #undef RegType return retv; @@ -96,12 +86,12 @@ std::string DataTypeToString(const proto::VarType::Type type) { static_cast(type)); } -size_t SizeOfType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_size_.find(type); - if (it != gDataTypeMap().cpp_to_size_.end()) { +size_t SizeOfType(proto::VarType::Type type) { + auto it = gDataTypeMap().proto_to_size_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_size_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", type.name()); + PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type)); } } // namespace framework diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index d5be43b33e..76df78ea5e 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -22,46 +22,59 @@ limitations under the License. */ namespace paddle { namespace framework { +template +struct DataTypeTrait {}; + +// Stub handle for void +template <> +struct DataTypeTrait { + constexpr static auto DataType = proto::VarType::RAW; +}; + +#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ + callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); + +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8) + +#define DefineDataTypeTrait(cpp_type, proto_type) \ + template <> \ + struct DataTypeTrait { \ + constexpr static auto DataType = proto_type; \ + } + +_ForEachDataType_(DefineDataTypeTrait); + +#undef DefineDataTypeTrait + extern proto::VarType::Type ToDataType(std::type_index type); extern std::type_index ToTypeIndex(proto::VarType::Type type); template inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { - switch (type) { - case proto::VarType::FP16: - visitor.template apply(); - break; - case proto::VarType::FP32: - visitor.template apply(); - break; - case proto::VarType::FP64: - visitor.template apply(); - break; - case proto::VarType::INT32: - visitor.template apply(); - break; - case proto::VarType::INT64: - visitor.template apply(); - break; - case proto::VarType::BOOL: - visitor.template apply(); - break; - case proto::VarType::UINT8: - visitor.template apply(); - break; - case proto::VarType::INT16: - visitor.template apply(); - break; - case proto::VarType::INT8: - visitor.template apply(); - break; - default: - PADDLE_THROW("Not supported %d", type); - } +#define VisitDataTypeCallback(cpp_type, proto_type) \ + do { \ + if (type == proto_type) { \ + visitor.template apply(); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(VisitDataTypeCallback); +#undef VisitDataTypeCallback + 
PADDLE_THROW("Not supported %d", type); } extern std::string DataTypeToString(const proto::VarType::Type type); -extern size_t SizeOfType(std::type_index type); +extern size_t SizeOfType(proto::VarType::Type type); inline std::ostream& operator<<(std::ostream& out, const proto::VarType::Type& type) { out << DataTypeToString(type); diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 54c41c55ba..92639dfc61 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -26,13 +26,13 @@ TEST(DataType, float16) { Tensor tensor; CPUPlace cpu; - tensor.mutable_data(cpu, f::ToTypeIndex(dtype)); + tensor.mutable_data(cpu, dtype); // test fp16 tensor - EXPECT_EQ(tensor.type(), std::type_index(typeid(float16))); + EXPECT_EQ(tensor.type(), f::ToDataType(typeid(float16))); // test fp16 size - EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u); + EXPECT_EQ(f::SizeOfType(dtype), 2u); // test debug info std::string type = "float16"; diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index e8bf53e160..9eaff1f560 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() { // Reduce All Tensor to trg in CPU ReduceLoDTensor func(lod_tensors, &trg); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); for (size_t i = 1; i < local_scopes_.size(); ++i) { auto &scope = diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h index 3f360c510a..b40b01df36 100644 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.h +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h @@ -33,7 +33,7 @@ struct FuseVarsOpHandle : public OpHandleBase { FuseVarsOpHandle(ir::Node *node, Scope *local_scope, const platform::Place &place, const std::unordered_map &inputs_numel, - const std::type_index &var_type) + const proto::VarType::Type var_type) : OpHandleBase(node), local_scope_(local_scope), place_(place), @@ -57,7 +57,7 @@ struct FuseVarsOpHandle : public OpHandleBase { Scope *local_scope_; const platform::Place place_; const std::unordered_map inputs_numel_; - const std::type_index type_; + const proto::VarType::Type type_; int64_t total_numel_; }; } // namespace details diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index cb864848b9..85d8abc910 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -246,7 +246,7 @@ void ReduceOpHandle::RunImpl() { if (!FLAGS_cpu_deterministic) { ReduceLoDTensor func(lod_tensors, out_var->GetMutable()); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); } else { // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0 // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0. 
@@ -256,7 +256,7 @@ void ReduceOpHandle::RunImpl() { ->FindVar(out_var_handle->name_) ->GetMutable(); ReduceLoDTensor func(lod_tensors, &reduce_sum_trg); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); auto trg = out_var->GetMutable(); if (reduce_sum_trg.data() != trg->data()) { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 04e3f78afe..eaef093ed3 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/dlpack_tensor.h" - +#include "paddle/fluid/framework/data_type.h" namespace paddle { namespace framework { @@ -36,26 +36,23 @@ static ::DLDataType GetDLDataTypeCode() { return dtype; } -static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { -#define REG_DL_DATA_TYPE(type) \ - { std::type_index(typeid(type)), GetDLDataTypeCode() } - static const std::unordered_map - type_to_dtype_map({ - REG_DL_DATA_TYPE(platform::float16), // NOLINT - REG_DL_DATA_TYPE(float), // NOLINT - REG_DL_DATA_TYPE(double), // NOLINT - REG_DL_DATA_TYPE(int), // NOLINT - REG_DL_DATA_TYPE(int64_t), // NOLINT - REG_DL_DATA_TYPE(bool), // NOLINT - REG_DL_DATA_TYPE(size_t), // NOLINT - REG_DL_DATA_TYPE(int16_t), // NOLINT - REG_DL_DATA_TYPE(uint8_t), // NOLINT - REG_DL_DATA_TYPE(int8_t) // NOLINT - }); +static std::unordered_map CreateDLDataTypeMap() { + static std::unordered_map result; + +#define REG_DL_DATA_TYPE(cpp_type, proto_type) \ + result[static_cast(proto_type)] = GetDLDataTypeCode() + + _ForEachDataType_(REG_DL_DATA_TYPE); +#undef REG_DL_DATA_TYPE + return result; +} + +static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { + static auto type_to_dtype_map = CreateDLDataTypeMap(); static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); - auto it = type_to_dtype_map.find(type); - PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s", - type.name()); + auto it = type_to_dtype_map.find(static_cast(type)); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %d", + type); return it->second; #undef REG_DL_DATA_TYPE } diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 938b056350..c0a8e1bcdf 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -91,23 +91,11 @@ void TestMainLoop() { } } } +TEST(dlpack, test_all) { +#define TestCallback(cpp_type, proto_type) TestMainLoop() -#define PADDLE_DLPACK_TEST(type) \ - TEST(dlpack, test_##type) { TestMainLoop(); } - -using float16 = platform::float16; -PADDLE_DLPACK_TEST(float16); -PADDLE_DLPACK_TEST(float); -PADDLE_DLPACK_TEST(double); -PADDLE_DLPACK_TEST(int); -PADDLE_DLPACK_TEST(int64_t); -PADDLE_DLPACK_TEST(bool); -PADDLE_DLPACK_TEST(size_t); -PADDLE_DLPACK_TEST(int16_t); -PADDLE_DLPACK_TEST(uint8_t); -PADDLE_DLPACK_TEST(int8_t); - -#undef PADDLE_DLPACK_TEST + _ForEachDataType_(TestCallback); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 3d53511615..f03f39dfc6 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -138,39 +138,19 @@ void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) { std::cout << sstream.str() << std::endl; } -void 
print_fetch_var(Scope* scope, std::string var_name) { - const LoDTensor& tensor = scope->FindVar(var_name)->Get(); - - if (std::type_index(tensor.type()) == - std::type_index(typeid(platform::float16))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(double))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int64_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(uint8_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int16_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int8_t))) { - print_lod_tensor(var_name, tensor); - } else { - VLOG(1) << "print_fetch_var: unrecognized data type:" - << tensor.type().name(); - } - - return; +static void print_fetch_var(Scope* scope, const std::string& var_name) { + auto& tensor = scope->FindVar(var_name)->Get(); + +#define PrintLoDTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor.type() == proto_type) { \ + print_lod_tensor(var_name, tensor); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(PrintLoDTensorCallback); + VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); } void ExecutorThreadWorker::TrainFiles() { diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 9b2eeaf59a..6c8bec32de 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -70,9 +70,9 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? 
t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - if (IsType(t.type())) { + if (t.type() == proto::VarType::FP32) { os << t.data()[i] << " "; - } else if (IsType(t.type())) { + } else if (t.type() == proto::VarType::INT64) { os << t.data()[i] << " "; } else { PADDLE_THROW("LoDTensor data type not in [float, int64_t]"); @@ -387,7 +387,7 @@ void LoDTensor::MergeLoDTensor( PADDLE_ENFORCE(!lod_tensors.empty()); framework::DDim new_dim = lod_tensors[0]->dims(); - std::type_index new_type = lod_tensors[0]->type(); + auto new_type = lod_tensors[0]->type(); framework::DataLayout new_layout = lod_tensors[0]->layout(); LoD new_lod = lod_tensors[0]->lod(); for (size_t i = 1; i < lod_tensors.size(); ++i) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f..05ab48412a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -43,10 +43,9 @@ std::vector> kKernelPriority = { proto::VarType::Type GetDataTypeOfVar(const Variable* var) { if (var->IsType()) { - return framework::ToDataType(var->Get().type()); + return var->Get().type(); } else if (var->IsType()) { - return framework::ToDataType( - var->Get().value().type()); + return var->Get().value().type(); } else { PADDLE_THROW("Var should be LoDTensor or SelectedRows"); } @@ -93,13 +92,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) { if (UNLIKELY(!tensor.IsInitialized())) { return ""; } - return DataTypeToString(ToDataType(tensor.type())); + return DataTypeToString(tensor.type()); } else if (var->IsType()) { auto tensor = var->Get().value(); if (UNLIKELY(!tensor.IsInitialized())) { return "uninited"; } else { - return DataTypeToString(ToDataType(tensor.type())); + return DataTypeToString(tensor.type()); } } else { return ""; @@ -686,7 +685,8 @@ static void CheckTensorNANOrInf(const std::string& name, if (tensor.memory_size() == 0) { return; } - if (!IsType(tensor.type()) && !IsType(tensor.type())) { + if (tensor.type() != proto::VarType::FP32 && + tensor.type() != proto::VarType::FP64) { return; } PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), @@ -879,7 +879,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - int tmp = static_cast(ToDataType(t->type())); + int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 62a30815d4..54a818250b 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -218,11 +218,11 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, if (index < 0) { VLOG(5) << "id " << id << " not in the table, return 0"; framework::VisitDataType( - framework::ToDataType(value_->type()), + value_->type(), TensorFillVisitor(value, i * value_width, value_width, 0.0)); } else { framework::VisitDataType( - framework::ToDataType(value_->type()), + value_->type(), TensorCopyVisitor(value, i * value_width, *value_.get(), index * value_width, value_width)); } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 41566800e5..57335847a1 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -16,7 +16,7 @@ limitations under the License. 
@@ -16,7 +16,7 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-extern size_t SizeOfType(std::type_index type);
+extern size_t SizeOfType(proto::VarType::Type type);
 void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
       holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
@@ -31,7 +31,7 @@ size_t Tensor::memory_size() const {
   return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }
 
-void* Tensor::mutable_data(platform::Place place, std::type_index type,
+void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type,
                            memory::Allocator::Attr attr,
                            size_t requested_size) {
   type_ = type;
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 71e8badd4b..057fe1f98c 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include
 #include
 #include
 #include
@@ -67,7 +68,7 @@ class Tensor {
   friend struct EigenVector;
 
  public:
-  Tensor() : type_(typeid(float)), offset_(0) {}
+  Tensor() : type_(proto::VarType::FP32), offset_(0) {}
 
   /*! Return a pointer to mutable memory block. */
   template <typename T>
@@ -88,7 +89,7 @@ class Tensor {
                      memory::Allocator::Attr attr = memory::Allocator::kDefault,
                      size_t requested_size = 0);
 
-  void* mutable_data(platform::Place place, std::type_index type,
+  void* mutable_data(platform::Place place, proto::VarType::Type type,
                      memory::Allocator::Attr attr = memory::Allocator::kDefault,
                      size_t requested_size = 0);
 
@@ -138,7 +139,7 @@ class Tensor {
     return holder_->place();
   }
 
-  std::type_index type() const {
+  proto::VarType::Type type() const {
     PADDLE_ENFORCE_NOT_NULL(
         holder_, "Tensor not initialized yet when Tensor::type() is called.");
     return type_;
@@ -161,7 +162,7 @@ class Tensor {
  private:
   /*! holds the memory block if allocated. */
   std::shared_ptr<memory::Allocation> holder_;
-  std::type_index type_;
+  proto::VarType::Type type_;
 
   /**
    * @brief points to elements dimensions.
   *
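With the tensor.h change above, Tensor stores a proto::VarType::Type tag and the untyped mutable_data overload takes that enum; allocation size then comes from a SizeOfType-style lookup keyed by the tag. A minimal sketch of that allocation path, under the assumption that names like SizeOfType and MiniTensor stand in for the real framework symbols:

#include <cstddef>
#include <cstdint>
#include <vector>

enum class VarType { FP32, FP64, INT32, INT64 };

size_t SizeOfType(VarType t) {
  switch (t) {
    case VarType::FP32:  return sizeof(float);
    case VarType::FP64:  return sizeof(double);
    case VarType::INT32: return sizeof(int32_t);
    case VarType::INT64: return sizeof(int64_t);
  }
  return 0;
}

struct MiniTensor {
  VarType type_ = VarType::FP32;  // enum tag instead of std::type_index
  std::vector<unsigned char> buf_;

  void* mutable_data(size_t numel, VarType type) {
    type_ = type;
    buf_.resize(numel * SizeOfType(type));  // byte width derived from the tag
    return buf_.data();
  }
};

An enum also gives the default constructor a natural default (FP32), which is what Tensor() : type_(proto::VarType::FP32) expresses above.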
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 0c9c0d782f..ce3ad18b1f 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -24,9 +24,8 @@ template <typename T>
 inline const T* Tensor::data() const {
   check_memory_size();
   bool valid =
-      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
-  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 type_.name());
+      std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType;
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_);
 
   return reinterpret_cast<const T*>(
       reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
@@ -38,9 +37,8 @@ template <typename T>
 inline T* Tensor::data() {
   check_memory_size();
   bool valid =
-      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
-  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 type_.name());
+      std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType;
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_);
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
 }
@@ -60,7 +58,7 @@ inline T* Tensor::mutable_data(platform::Place place,
                                size_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   return reinterpret_cast<T*>(
-      mutable_data(place, typeid(T), attr, requested_size));
+      mutable_data(place, DataTypeTrait<T>::DataType, attr, requested_size));
 }
 
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index ca1e01c89f..85d15c5d3f 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -186,8 +186,8 @@ struct AnyDTypeVisitor {
 template <typename Predicate, typename DevCtx>
 inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
                     const DevCtx& ctx, framework::Tensor* out) {
-  VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
-                                               predicate, tensor, ctx, out));
+  VisitDataType(tensor.type(), AnyDTypeVisitor<Predicate, DevCtx>(
+                                   predicate, tensor, ctx, out));
 }
 
@@ -379,7 +379,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
     // int32_t  size
     // void*    protobuf message
     proto::VarType::TensorDesc desc;
-    desc.set_data_type(framework::ToDataType(tensor.type()));
+    desc.set_data_type(tensor.type());
     auto dims = framework::vectorize(tensor.dims());
     auto* pb_dims = desc.mutable_dims();
     pb_dims->Resize(static_cast<int>(dims.size()), 0);
@@ -461,9 +461,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     tensor->Resize(framework::make_ddim(dims));
     void* buf;
     auto ctx = platform::CPUDeviceContext();
-    size_t size =
-        tensor->numel() *
-        framework::SizeOfType(framework::ToTypeIndex(desc.data_type()));
+    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
       Tensor cpu_tensor;
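The tensor_impl.h hunks above lean on DataTypeTrait, a compile-time map from a C++ type to the runtime enum, so data<T>() can validate the held type by comparing two enums. A self-contained sketch of the idea; DataTypeTrait and checked_data here are illustrative stand-ins written under the assumption of a two-type enum:

#include <cassert>
#include <cstdint>

enum class VarType { FP32, INT64 };

template <typename T> struct DataTypeTrait;
template <> struct DataTypeTrait<float> {
  static constexpr VarType DataType = VarType::FP32;
};
template <> struct DataTypeTrait<int64_t> {
  static constexpr VarType DataType = VarType::INT64;
};

template <typename T>
const T* checked_data(const void* buf, VarType held) {
  // Mirrors the PADDLE_ENFORCE(valid, ...) guard in the hunk above.
  assert(held == DataTypeTrait<T>::DataType && "Tensor holds the wrong type");
  return static_cast<const T*>(buf);
}

int main() {
  float x = 1.0f;
  const float* p = checked_data<float>(&x, VarType::FP32);  // passes the check
  return p == &x ? 0 : 1;
}

Because the comparison is between plain enums, the error message can print the tag with %d, which is why both data() overloads above now format the held type that way.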
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index be51e7fc1f..c751e85158 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -289,10 +289,10 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetchs_[idx]->Input("X")[0];
-    if (type == typeid(float)) {
+    if (type == framework::proto::VarType::FP32) {
       GetFetchOne<float>(fetch, output);
       output->dtype = PaddleDType::FLOAT32;
-    } else if (type == typeid(int64_t)) {
+    } else if (type == framework::proto::VarType::INT64) {
       GetFetchOne<int64_t>(fetch, output);
       output->dtype = PaddleDType::INT64;
     } else {
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 4c5b412a2c..3d121e0460 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -266,10 +266,10 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
     auto type = fetch.type();
     auto output = &(outputs->at(i));
     output->name = fetchs_[idx]->Input("X")[0];
-    if (type == typeid(float)) {
+    if (type == framework::DataTypeTrait<float>::DataType) {
      GetFetchOne<float>(fetch, output);
       output->dtype = PaddleDType::FLOAT32;
-    } else if (type == typeid(int64_t)) {
+    } else if (type == framework::DataTypeTrait<int64_t>::DataType) {
       GetFetchOne<int64_t>(fetch, output);
       output->dtype = PaddleDType::INT64;
     } else {
diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
index 014bdc6a37..191225493c 100644
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -36,10 +36,10 @@ namespace paddle {
 
 PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
   PaddleTensor pt;
-  if (t->type() == typeid(int64_t)) {
+  if (t->type() == framework::proto::VarType::INT64) {
     pt.data.Reset(t->data(), t->numel() * sizeof(int64_t));
     pt.dtype = PaddleDType::INT64;
-  } else if (t->type() == typeid(float)) {
+  } else if (t->type() == framework::proto::VarType::FP32) {
     pt.data.Reset(t->data(), t->numel() * sizeof(float));
     pt.dtype = PaddleDType::FLOAT32;
   } else {
diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc
index 6f7da445fc..1de59a5165 100644
--- a/paddle/fluid/operators/affine_grid_op.cc
+++ b/paddle/fluid/operators/affine_grid_op.cc
@@ -78,7 +78,7 @@ class AffineGridOp : public framework::OperatorWithKernel {
       library = framework::LibraryType::kCUDNN;
     }
 #endif
-    auto data_type = framework::ToDataType(ctx.Input("Theta")->type());
+    auto data_type = ctx.Input("Theta")->type();
     return framework::OpKernelType(data_type, ctx.GetPlace(),
                                    framework::DataLayout::kAnyLayout, library);
   }
@@ -188,9 +188,9 @@ class AffineGridOpGrad : public framework::OperatorWithKernel {
       library_ = framework::LibraryType::kCUDNN;
     }
 #endif
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Theta")->type()),
-        ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_);
+    return framework::OpKernelType(ctx.Input("Theta")->type(),
+                                   ctx.GetPlace(),
+                                   framework::DataLayout::kAnyLayout, library_);
   }
 };
diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc
index 8174d37358..7fe9a0df74 100644
--- a/paddle/fluid/operators/arg_max_op.cc
+++ b/paddle/fluid/operators/arg_max_op.cc
@@ -28,6 +28,5 @@ REGISTER_OP_CPU_KERNEL(
                                     int32_t>,
     paddle::operators::ArgMaxKernel,
-    paddle::operators::ArgMaxKernel,
     paddle::operators::ArgMaxKernel);
diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu
index a147d77a9e..85e4f98173 100644
--- a/paddle/fluid/operators/arg_max_op.cu
+++ b/paddle/fluid/operators/arg_max_op.cu
@@ -25,7 +25,5 @@ REGISTER_OP_CUDA_KERNEL(
                                     int32_t>,
     paddle::operators::ArgMaxKernel,
-    paddle::operators::ArgMaxKernel,
     paddle::operators::ArgMaxKernel);
diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc
index 41f188029f..23b24735cd 100644
--- a/paddle/fluid/operators/arg_min_op.cc
+++ b/paddle/fluid/operators/arg_min_op.cc
@@ -28,6 +28,5 @@ REGISTER_OP_CPU_KERNEL(
                                     int32_t>,
     paddle::operators::ArgMinKernel,
-    paddle::operators::ArgMinKernel,
     paddle::operators::ArgMinKernel);
diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu
index 4d02050850..47d7c8b122 100644
--- a/paddle/fluid/operators/arg_min_op.cu
+++ b/paddle/fluid/operators/arg_min_op.cu
@@ -25,7 +25,5 @@ REGISTER_OP_CUDA_KERNEL(
                                     int32_t>,
     paddle::operators::ArgMinKernel,
-    paddle::operators::ArgMinKernel,
     paddle::operators::ArgMinKernel);
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index 6257e04b01..d942391b86 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -58,7 +58,7 @@ struct ArrayToLoDFunctor : public boost::static_visitor<void> {
     ArrayToLoDFunctorImpl functor;
     functor.dev_ctx_ = dev_ctx;
     functor.prev_functor_ = this;
-    framework::VisitDataType(framework::ToDataType(out->type()), functor);
+    framework::VisitDataType(out->type(), functor);
   }
 };
 
@@ -91,7 +91,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
     PADDLE_ENFORCE(!x.empty(), "There's no element in the input array.");
     int rank = x[0].dims().size();
     platform::Place place = x[0].place();
-    std::type_index data_type = x[0].type();
+    auto data_type = x[0].type();
     int64_t batch_size = x[0].dims()[0];
     framework::DDim ins_dims = rank > 1 ? framework::slice_ddim(x[0].dims(), 1, rank)
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 75fc59125f..b6996be4b0 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -121,9 +121,8 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input("X")->type()),
-      ctx.device_context());
+  return framework::OpKernelType(ctx.Input("X")->type(),
+                                 ctx.device_context());
 }
 
 void AttentionLSTMOpMaker::Make() {
diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
index f389eab605..0922b03b5f 100644
--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -103,9 +103,8 @@ class AverageAccumulatesOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("param")->type()),
-        ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("param")->type(),
+                                   ctx.GetPlace());
   }
 };
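The batch_norm_op.cc diff that follows also simplifies a dtype-consistency guard: the parameter type is derived from the input dtype (FP32 unless the input is FP64, with FP16 inputs still keeping FP32 parameters) and each parameter tensor is checked against it. A toy version of that logic, with VarType and the exception-based check standing in for the framework's enum and PADDLE_ENFORCE_EQ:

#include <stdexcept>

enum class VarType { FP16, FP32, FP64 };

struct MiniTensor {
  VarType dtype;
  VarType type() const { return dtype; }
};

void CheckBatchNormParamTypes(const MiniTensor& x, const MiniTensor& scale,
                              const MiniTensor& bias) {
  // FP16 inputs still keep FP32 parameters, mirroring the diff's comment.
  VarType param_type =
      (x.type() == VarType::FP64) ? VarType::FP64 : VarType::FP32;
  if (scale.type() != param_type)
    throw std::runtime_error("Scale input should be of float type");
  if (bias.type() != param_type)
    throw std::runtime_error("Bias input should be of float type");
}

With type() already returning the enum, each check collapses from a two-line ToDataType() call to a direct comparison.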
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index f66813989c..8b672e09b2 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -72,8 +72,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input("X")->type());
+    auto input_data_type = ctx.Input("X")->type();
     // By default, the type of the scale, bias, mean,
     // and var tensors should both be float. (For float or float16 input tensor)
     // or double (For double input tensor).
@@ -81,17 +80,13 @@ class BatchNormOp : public framework::OperatorWithKernel {
     if (input_data_type == framework::proto::VarType::FP64) {
       bn_param_type = framework::proto::VarType::FP64;
     }
-    PADDLE_ENFORCE_EQ(bn_param_type,
-                      framework::ToDataType(ctx.Input("Scale")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Scale")->type(),
                       "Scale input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type,
-                      framework::ToDataType(ctx.Input("Bias")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Bias")->type(),
                       "Bias input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type,
-                      framework::ToDataType(ctx.Input("Mean")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Mean")->type(),
                       "Mean input should be of float type");
-    PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
-                                         ctx.Input("Variance")->type()),
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Variance")->type(),
                       "Variance input should be of float type");
 
     // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
@@ -413,9 +408,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     }
 #endif
 
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(),
-        layout, library);
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.GetPlace(), layout, library);
   }
 };
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 0d32cae0e1..ae9765b761 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -145,7 +145,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
     LoDTensor* sentenceScores = ctx.Output("SentenceScores");
 
     framework::VisitDataType(
-        framework::ToDataType(scores->at(0).type()),
+        scores->at(0).type(),
         BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores,
                                 beam_size, end_id));
   }
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index 62771d09f1..30f700f1d9 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -282,8 +282,7 @@ class BeamSearchOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     framework::OpKernelType kt = framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input("pre_ids")->type()),
+        ctx.Input("pre_ids")->type(),
         platform::CPUPlace());
     return kt;
   }
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
index 9258d7c7e8..f349c51d8a 100644
--- a/paddle/fluid/operators/bpr_loss_op.cc
+++ b/paddle/fluid/operators/bpr_loss_op.cc
@@ -47,9 +47,8 @@ class BprLossOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   platform::CPUPlace());
   }
 };
 
@@ -94,9 +93,8 @@ class BprLossGradientOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc
index 135254ce6b..dd28f82b65 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc
@@ -48,13 +48,12 @@ class ConditionalOp : public framework::OperatorBase {
     if (!(ips.size() == 1UL && ips[0]->IsInitialized())) {
       PADDLE_THROW("should have one initialized input as condition");
     }
-    if (!(framework::IsType<bool>(ips[0]->type()) &&  // NOLINT
-          ips[0]->numel() == 1)) {
-      PADDLE_THROW(
-          "condition input's data type should be bool, "
-          "numel should be 1, actual numel is %d",
-          ips[0]->numel());
-    }
+
+    PADDLE_ENFORCE(ips[0]->type() == framework::proto::VarType::BOOL &&
+                       ips[0]->numel() == 1,
+                   "condition input's data type should be bool, "
+                   "numel should be 1, actual numel is %d",
+                   ips[0]->numel());
     bool res = false;
     if (platform::is_gpu_place(ips[0]->place())) {
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 6c1b2f329a..66f8508f02 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -237,7 +237,7 @@ class WhileGradOp : public framework::OperatorBase {
         if (var->IsType()) {
           auto &inside_tensor = var->Get();
           framework::AttributeMap attrs;
-          attrs["dtype"] = framework::ToDataType(inside_tensor.type());
+          attrs["dtype"] = inside_tensor.type();
           attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
           attrs["value"] = 0.0f;
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index d7b8766288..183850db18 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -95,10 +95,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
   }
 #endif
 
-  auto input_data_type =
-      framework::ToDataType(ctx.Input("Input")->type());
-  auto filter_data_type =
-      framework::ToDataType(ctx.Input("Filter")->type());
+  auto input_data_type = ctx.Input("Input")->type();
+  auto filter_data_type = ctx.Input("Filter")->type();
   PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
                     "input and filter data type should be consistent");
 
@@ -382,9 +380,9 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
   }
 #endif
 
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(),
-      layout_, library_, customized_type_value);
+  return framework::OpKernelType(ctx.Input("Input")->type(),
+                                 ctx.GetPlace(), layout_, library_,
+                                 customized_type_value);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index 2fdfc40d19..86a140f152 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -104,9 +104,8 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
   }
 #endif
 
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(),
-      layout_, library_);
+  return framework::OpKernelType(ctx.Input("Input")->type(),
+                                 ctx.GetPlace(), layout_, library_);
 }
 
 void Conv2DTransposeOpMaker::Make() {
@@ -335,9 +334,8 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
   std::string data_format = ctx.Attr<std::string>("data_format");
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(),
-      layout_, library_);
+  return framework::OpKernelType(ctx.Input("Input")->type(),
+                                 ctx.GetPlace(), layout_, library_);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index c27befe114..81c9e9e543 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -118,9 +118,8 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Emission")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("Emission")->type(),
+                                   platform::CPUPlace());
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
index a2a871efa8..97d20681b8 100644
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -51,9 +51,8 @@ class CropOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -174,9 +173,7 @@ class CropOpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input(framework::GradVarName("Out"))
-                ->type()),
+        ctx.Input(framework::GradVarName("Out"))->type(),
         ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index a904dd9130..1968e54b00 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -57,9 +57,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -111,9 +110,8 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
   // is determined by its input "X".
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
index d2b440d9d2..e7c472f8c0 100644
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ b/paddle/fluid/operators/ctc_align_op.cc
@@ -36,9 +36,8 @@ class CTCAlignOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Input")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("Input")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc
index 0c0155a0a9..f2984d1af2 100644
--- a/paddle/fluid/operators/detection/anchor_generator_op.cc
+++ b/paddle/fluid/operators/detection/anchor_generator_op.cc
@@ -53,8 +53,7 @@ class AnchorGeneratorOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Input")->type()),
-        ctx.device_context());
+        ctx.Input("Input")->type(), ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
index c23b65fe4d..b7da1261a8 100644
--- a/paddle/fluid/operators/detection/bipartite_match_op.cc
+++ b/paddle/fluid/operators/detection/bipartite_match_op.cc
@@ -45,9 +45,8 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("DistMat")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("DistMat")->type(),
+                                   platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc
index 1012ba3652..cacd47ed4a 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.cc
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cc
@@ -66,8 +66,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Input")->type()),
-        ctx.GetPlace());
+        ctx.Input("Input")->type(), ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index 709c2dfc4b..2c46803fd0 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -66,9 +66,8 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Anchors")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("Anchors")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc
index 54a4b87ec8..f70e6adb5b 100644
--- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc
@@ -249,8 +249,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("ClsLoss")->type()),
-        platform::CPUPlace());
+        ctx.Input("ClsLoss")->type(), platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
index f0f8851be0..2395b18148 100644
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -65,8 +65,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input("Scores")->type()),
+        ctx.Input("Scores")->type(),
         platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc
index b5cb6a724c..3e75c0394f 100644
--- a/paddle/fluid/operators/detection/prior_box_op.cc
+++ b/paddle/fluid/operators/detection/prior_box_op.cc
@@ -72,8 +72,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Input")->type()),
-        ctx.device_context());
+        ctx.Input("Input")->type(), ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
index 42c720e701..3796854fe6 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
@@ -498,9 +498,8 @@ class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
 protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -519,9 +518,8 @@ class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel {
 protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index 46fff9d338..dc6c3d5a66 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -78,8 +78,7 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input("Anchor")->type()),
+        ctx.Input("Anchor")->type(),
         platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc
index 3670019392..c057c82ce0 100644
--- a/paddle/fluid/operators/detection/target_assign_op.cc
+++ b/paddle/fluid/operators/detection/target_assign_op.cc
@@ -57,9 +57,8 @@ class TargetAssignOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
index d7f49a9590..e1d113f854 100644
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -71,8 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input("DetectRes")->type()),
+        ctx.Input("DetectRes")->type(),
         platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 87bf7c6b15..41644d8cc1 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -197,8 +197,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type = framework::ToDataType(
-        ctx.Input(framework::GradVarName("Out"))->type());
+    auto input_data_type =
+        ctx.Input(framework::GradVarName("Out"))->type();
 
 #ifdef PADDLE_WITH_MKLDNN
     if (platform::CanMKLDNNBeUsed(ctx)) {
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index 43af83fd69..8aff911141 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -115,9 +115,8 @@ class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -175,9 +174,8 @@ class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index e80249fc87..1ed8a2ddd1 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -79,9 +79,8 @@ framework::OpKernelType FCOp::GetExpectedKernelType(
     library = framework::LibraryType::kMKLDNN;
     layout = framework::DataLayout::kMKLDNN;
   }
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(),
-      layout, library);
+  return framework::OpKernelType(ctx.Input("Input")->type(),
+                                 ctx.GetPlace(), layout, library);
 }
 
 void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
@@ -111,9 +110,8 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType(
     library = framework::LibraryType::kMKLDNN;
     layout = framework::DataLayout::kMKLDNN;
   }
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(),
-      layout, library);
+  return framework::OpKernelType(ctx.Input("Input")->type(),
+                                 ctx.GetPlace(), layout, library);
 }
 
 void FCOpMaker::Make() {
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 252f313440..38cb33e790 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -59,9 +59,9 @@ class FillConstantOp : public framework::OperatorBase {
 
     if (force_cpu) {
       auto cpu = platform::CPUPlace();
-      tensor->mutable_data(cpu, framework::ToTypeIndex(data_type));
+      tensor->mutable_data(cpu, data_type);
     } else {
-      tensor->mutable_data(dev_place, framework::ToTypeIndex(data_type));
+      tensor->mutable_data(dev_place, data_type);
     }
 
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
tensor.Resize(out.dims()); - tensor.mutable_data(cpu, framework::ToTypeIndex(dtype)); + tensor.mutable_data(cpu, dtype); } framework::VisitDataType( diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index 3771aac0df..0fbf564b7e 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -135,9 +135,8 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx.Input("X")->type(), ctx.Input("Y")->type(), "The element's type of input should be the same."); - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -324,9 +323,8 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type_index = ctx.Input("Y")->type(); - auto input_data_type = framework::ToDataType(input_data_type_index); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("Y")->type(), + ctx.GetPlace()); } }; } // namespace operators diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 1eb6523a2d..f1466f17fe 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -115,8 +115,7 @@ void FusedEmbeddingFCLSTMOp::InferShape( framework::OpKernelType FusedEmbeddingFCLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("Embeddings")->type()), + ctx.Input("Embeddings")->type(), ctx.device_context()); } diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 25b7ae7c28..4ce67e16dd 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -93,9 +93,8 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionGRUOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void FusionGRUOpMaker::Make() { diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 8021a896ce..c4e752e3f0 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -117,9 +117,8 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void FusionLSTMOpMaker::Make() { diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index 40bba09f3e..b05329cfd0 100644 
--- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -61,9 +61,8 @@ void FusionSeqConvEltAddReluOp::InferShape( framework::OpKernelType FusionSeqConvEltAddReluOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void FusionSeqConvEltAddReluOpMaker::Make() { diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 17ed9771d0..aaef46de0d 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -67,9 +67,8 @@ void FusionSeqExpandConcatFCOp::InferShape( framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.MultiInput("X")[0]->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.MultiInput("X")[0]->type(), + ctx.device_context()); } void FusionSeqExpandConcatFCOpMaker::Make() { diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 95aa9b573c..0a8c0814a7 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -42,9 +42,8 @@ class GatherOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -60,9 +59,8 @@ class GatherGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index e76eb6893b..14a2524bd8 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -63,9 +63,9 @@ class GridSampleOp : public framework::OperatorWithKernel { library_ = framework::LibraryType::kCUDNN; } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - framework::DataLayout::kAnyLayout, library_); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library_); } }; @@ -159,9 +159,9 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { library_ = framework::LibraryType::kCUDNN; } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - framework::DataLayout::kAnyLayout, library_); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library_); } }; diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 6322659b67..4fa15058f8 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ 
b/paddle/fluid/operators/group_norm_op.cc @@ -141,8 +141,7 @@ class GroupNormGradOp : public framework::OperatorWithKernel { if (t == nullptr) { PADDLE_THROW("can't find Y@GRAD"); } - return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.GetPlace()); + return framework::OpKernelType(t->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 0dbcc442df..a807117115 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -76,9 +76,8 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -163,9 +162,8 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 4d25822259..93dd3f794f 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -55,8 +55,8 @@ class InterpolateOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; @@ -124,8 +124,8 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index 29b73951bb..ba50bdf34b 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -35,8 +35,7 @@ class IsEmptyOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + ctx.Input("X")->type(), platform::CPUPlace()); return kt; } }; diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index 7b42efd623..1312eecfa4 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -40,10 +40,9 @@ class OverflowOp : public framework::OperatorWithKernel { int dtype = -1; auto *x_var = ctx.InputVar("X"); if (x_var->IsType()) { - dtype = framework::ToDataType(x_var->Get().type()); + dtype = x_var->Get().type(); } else if (x_var->IsType()) { - dtype = framework::ToDataType( - x_var->Get().value().type()); + dtype = x_var->Get().value().type(); } else { PADDLE_THROW("Cannot find the 
input data type by all input data"); } diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 14ce1da2e9..f83fe355b8 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -153,8 +153,7 @@ class LayerNormGradOp : public framework::OperatorWithKernel { if (t == nullptr) { PADDLE_THROW("can't find Y@GRAD"); } - return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.GetPlace()); + return framework::OpKernelType(t->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index ea1ca7f59d..998b7f09c3 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -184,9 +184,8 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { // is determined by its input "Emission". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Emission")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Emission")->type(), + platform::CPUPlace()); } }; @@ -244,9 +243,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("LogLikelihood")) - ->type()), + ctx.Input(framework::GradVarName("LogLikelihood"))->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 9d1423915a..e28d199eeb 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -69,7 +69,7 @@ class LoadCombineOp : public framework::OperatorBase { // Get data from fin to tensor DeserializeFromStream(*buffer, tensor, dev_ctx); - auto in_dtype = framework::ToDataType(tensor->type()); + auto in_dtype = tensor->type(); auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index df1edc5c2e..06773d1d0e 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -65,7 +65,7 @@ class LoadOp : public framework::OperatorBase { DeserializeFromStream(fin, tensor, dev_ctx); auto load_as_fp16 = Attr("load_as_fp16"); - auto in_dtype = framework::ToDataType(tensor->type()); + auto in_dtype = tensor->type(); auto out_dtype = load_as_fp16 ? 
framework::proto::VarType::FP16 : in_dtype; if (in_dtype != out_dtype) { diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index 0d4e84e850..7c8fe5fbd7 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -39,9 +39,8 @@ class LoDResetOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -144,9 +143,8 @@ class LoDResetGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 145d2db118..9b91cf5260 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -72,7 +72,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor { LoDTensorToArrayFunctorImpl func; func.prev_functor_ = this; func.dev_ctx_ = dev_ctx; - framework::VisitDataType(framework::ToDataType(input_.type()), func); + framework::VisitDataType(input_.type(), func); } }; diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc index 1b55527fd3..4840a7ac1e 100644 --- a/paddle/fluid/operators/lookup_sparse_table_op.cc +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -63,8 +63,7 @@ class LookupSparseTableOp : public framework::OperatorBase { out_shape[0] = ids_t.numel(); out_t->Resize(out_shape); out_t->mutable_data(cpu, w_t->value().type()); - PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()), - framework::proto::VarType::FP32, + PADDLE_ENFORCE_EQ(w_t->value().type(), framework::proto::VarType::FP32, "The sparse table only support FP32"); w_t->Get(ids_t, out_t, true, is_test); out_t->set_lod(ids_t.lod()); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index a3bb2be5c7..06ac31b5f1 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -145,9 +145,8 @@ framework::OpKernelType GetExpectedLRNKernel( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("X")->type(), ctx.GetPlace(), + layout_, library_); } } // namespace diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 3225bf9bb6..4a199d681f 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -96,8 +96,7 @@ class LSTMOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; @@ -261,8 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) 
       const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Input")->type()),
-        ctx.device_context());
+        ctx.Input("Input")->type(), ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index e398b51480..7a62bc9f82 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -113,8 +113,7 @@ class LSTMPOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Input")->type()),
-        ctx.device_context());
+        ctx.Input("Input")->type(), ctx.device_context());
   }
 };
@@ -312,8 +311,7 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Input")->type()),
-        ctx.device_context());
+        ctx.Input("Input")->type(), ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 854c8653ff..e1491a8156 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -77,16 +77,14 @@ template <>
 void set_constant_with_place(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
-  framework::VisitDataType(framework::ToDataType(tensor->type()),
-                           TensorSetConstantCPU(tensor, value));
+  framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
 }
 
 template <>
 void set_constant_with_place(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
-  framework::VisitDataType(framework::ToDataType(tensor->type()),
-                           TensorSetConstantCPU(tensor, value));
+  framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
 }
 
 struct TensorSetConstantWithPlace : public boost::static_visitor {
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 9372d63f0b..4645b3ae6e 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -65,7 +65,7 @@ template <>
 void set_constant_with_place(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
-  framework::VisitDataType(framework::ToDataType(tensor->type()),
+  framework::VisitDataType(tensor->type(),
                            TensorSetConstantGPU(context, tensor, value));
 }
diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc
index a60f245f53..bb290046f3 100644
--- a/paddle/fluid/operators/mean_iou_op.cc
+++ b/paddle/fluid/operators/mean_iou_op.cc
@@ -44,9 +44,8 @@ class MeanIoUOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Predictions")->type()),
-        ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("Predictions")->type(),
+                                   ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 820636defa..35b6d7b5e3 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -61,9 +61,7 @@ class MeanGradOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input("X")->type());
-
+    auto input_data_type = ctx.Input("X")->type();
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index 2dc1467b0d..da7fa1b81d 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -63,9 +63,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {
     platform::Place place = dev_place;
 
     int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
-
-    std::type_index data_type =
-        in_true.IsInitialized() ? in_true.type() : in_false.type();
+    auto data_type = in_true.IsInitialized() ? in_true.type() : in_false.type();
     int rank;
     framework::DDim in_dims;
     if (in_true.IsInitialized()) {
diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc
index 95aa76bc69..7db6dff297 100644
--- a/paddle/fluid/operators/metrics/accuracy_op.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op.cc
@@ -55,9 +55,8 @@ class AccuracyOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Out")->type()),
-        ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("Out")->type(),
+                                   ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc
index 335d4fded4..5e33dd9606 100644
--- a/paddle/fluid/operators/metrics/auc_op.cc
+++ b/paddle/fluid/operators/metrics/auc_op.cc
@@ -51,9 +51,8 @@ class AucOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Predict")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("Predict")->type(),
+                                   platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc
index 0d733c47dd..1a67b13491 100644
--- a/paddle/fluid/operators/metrics/precision_recall_op.cc
+++ b/paddle/fluid/operators/metrics/precision_recall_op.cc
@@ -82,9 +82,8 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("MaxProbs")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("MaxProbs")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index 18ad46cb5e..1801f2915e 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -53,9 +53,8 @@ class MultiplexOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.MultiInput("X")[0]->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.MultiInput("X")[0]->type(),
+                                   ctx.device_context());
   }
 };
@@ -123,9 +122,8 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.MultiInput("X")[0]->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.MultiInput("X")[0]->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 9f97f7821d..06c35c789f 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -69,9 +69,8 @@ class NCEOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Input")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("Input")->type(),
+                                   platform::CPUPlace());
   }
 };
@@ -214,9 +213,8 @@ class NCEOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Input")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("Input")->type(),
+                                   platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc
index 9039d02b67..dd365629fc 100644
--- a/paddle/fluid/operators/optimizers/adadelta_op.cc
+++ b/paddle/fluid/operators/optimizers/adadelta_op.cc
@@ -70,9 +70,8 @@ class AdadeltaOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc
index e8d5a9e2c8..bd1bb98e63 100644
--- a/paddle/fluid/operators/optimizers/adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/adagrad_op.cc
@@ -59,9 +59,8 @@ class AdagradOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 5710cda39a..5eae503461 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -75,8 +75,7 @@ class AdamOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input("Param")->type());
+    auto input_data_type = ctx.Input("Param")->type();
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
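Note: every hunk above (and in the files that follow) applies one mechanical rewrite: Tensor::type() now returns the framework::proto::VarType::Type enum directly, so the old framework::ToDataType(std::type_index) bridge is dropped at each call site. A minimal before/after sketch of the pattern; the op name FooOp is hypothetical and not code from this patch series:

    // Before: type() returned std::type_index, which had to be mapped
    // onto the proto enum before an OpKernelType could be built.
    framework::OpKernelType FooOp::GetExpectedKernelType(
        const framework::ExecutionContext& ctx) const {
      return framework::OpKernelType(
          framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
          ctx.device_context());
    }

    // After: type() already yields proto::VarType::Type, so the value is
    // passed straight through with no conversion.
    framework::OpKernelType FooOp::GetExpectedKernelType(
        const framework::ExecutionContext& ctx) const {
      return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
                                     ctx.device_context());
    }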
diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc
index 4b244a76dc..aef1fc972c 100644
--- a/paddle/fluid/operators/optimizers/adamax_op.cc
+++ b/paddle/fluid/operators/optimizers/adamax_op.cc
@@ -76,9 +76,8 @@ class AdamaxOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
index 80278441c0..07899278f9 100644
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc
@@ -64,9 +64,8 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc
index 1c9e91d9b6..c1a4f5790b 100644
--- a/paddle/fluid/operators/optimizers/ftrl_op.cc
+++ b/paddle/fluid/operators/optimizers/ftrl_op.cc
@@ -66,8 +66,7 @@ class FTRLOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input("Param")->type());
+    auto input_data_type = ctx.Input("Param")->type();
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
index 7b07b3b707..9dd9b8afbd 100644
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
@@ -58,9 +58,8 @@ class ProximalAdagradOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc
index dcef4f7be2..fccfc2b458 100644
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc
@@ -46,9 +46,8 @@ class ProximalGDOp : public framework::OperatorWithKernel {
   }
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input("Param")->type());
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("Param")->type(),
+                                   ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc
index a9da21f479..6ef2dacb38 100644
--- a/paddle/fluid/operators/pad2d_op.cc
+++ b/paddle/fluid/operators/pad2d_op.cc
@@ -511,8 +511,8 @@ class Pad2dOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.GetPlace());
   }
 };
@@ -612,8 +612,8 @@ class Pad2dOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc
index 685ebc3937..3f827c26fd 100644
--- a/paddle/fluid/operators/pad_constant_like_op.cc
+++ b/paddle/fluid/operators/pad_constant_like_op.cc
@@ -47,9 +47,8 @@ class PadConstantLikeOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Y")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("Y")->type(),
+                                   ctx.device_context());
   }
 };
@@ -171,9 +170,8 @@ class PadConstantLikeOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Y")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("Y")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 52b607df74..6259954849 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -99,9 +99,8 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
   }
 #endif
 
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(),
-      layout_, library_);
+  return framework::OpKernelType(ctx.Input("X")->type(), ctx.GetPlace(),
+                                 layout_, library_);
 }
 
 void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
@@ -130,7 +129,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
   }
 #endif
 
-  auto input_data_type = framework::ToDataType(ctx.Input("X")->type());
+  auto input_data_type = ctx.Input("X")->type();
   if (input_data_type == framework::proto::VarType::FP16) {
     PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
                       "float16 can only be used when CUDNN is used");
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
index 873706593e..179ee96e01 100644
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -71,9 +71,8 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
@@ -92,9 +91,8 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
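Note: with the data type held as an enum, dtype special-casing reduces to plain comparisons, as in the pool_op.cc hunk above. A short sketch of that check, assuming a tensor t and a library_ value chosen earlier in the function:

    auto input_data_type = t.type();  // framework::proto::VarType::Type
    if (input_data_type == framework::proto::VarType::FP16) {
      // float16 kernels exist only for CUDNN in this codebase, so fail fast
      // before kernel dispatch if another library was selected.
      PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
                        "float16 can only be used when CUDNN is used");
    }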
diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc
index 4d865b7f17..99256e408d 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.cc
+++ b/paddle/fluid/operators/positive_negative_pair_op.cc
@@ -87,9 +87,8 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Score")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("Score")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index 64d94ab604..62c55c4f55 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -56,9 +56,8 @@ class PReluOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
@@ -113,9 +112,8 @@ class PReluGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index e7f1caf4d3..6a5bf17060 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -172,7 +172,7 @@ class TensorPrintOp : public framework::OperatorBase {
       formater.name = printed_var_name;
     }
     if (Attr("print_tensor_type")) {
-      formater.dtype = printed_tensor.type();
+      formater.dtype = framework::ToTypeIndex(printed_tensor.type());
     }
     if (Attr("print_tensor_shape")) {
       auto &dims = printed_tensor.dims();
diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc
index 123fa44fa3..cd3bd32adb 100644
--- a/paddle/fluid/operators/random_crop_op.cc
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -22,9 +22,8 @@ class RandomCropOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
index e17c2ffd39..f771cebd0c 100644
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -99,10 +99,10 @@ void BatchReader::ReadNextImpl(std::vector* out) {
   out->reserve(out_num);
   for (size_t j = 0; j < out_num; ++j) {
     // Merge shape and check date type
-    std::type_index batch_type = buffer_[0][j].type();
+    auto batch_type = buffer_[0][j].type();
     framework::DDim batch_shape = buffer_[0][j].dims();
     for (size_t i = 1; i < buffer_.size(); ++i) {
-      std::type_index ins_type = buffer_[i][j].type();
+      auto ins_type = buffer_[i][j].type();
       framework::DDim ins_shape = buffer_[i][j].dims();
       PADDLE_ENFORCE_EQ(batch_type, ins_type);
       PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 162bfcbb08..a1e02a3fd0 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -414,7 +414,7 @@ class RecurrentGradOp : public RecurrentBase {
           auto &inside_tensor = cur_scope.FindVar(inside_grad_name)
                                     ->Get();
           framework::AttributeMap attrs;
-          attrs["dtype"] = framework::ToDataType(inside_tensor.type());
+          attrs["dtype"] = inside_tensor.type();
           attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
           attrs["value"] = 0.0f;
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 500d86fec3..289d848ea1 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -108,9 +108,8 @@ class ReshapeOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
@@ -189,9 +188,8 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
@@ -322,9 +320,7 @@ class Reshape2GradOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input(framework::GradVarName("Out"))
-                ->type()),
+        ctx.Input(framework::GradVarName("Out"))->type(),
         ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
index 0fb7776fd9..834dd1eabd 100644
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -99,7 +99,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
       auto &in_var_tensor = in_var->Get();
 
       framework::AttributeMap attrs;
-      attrs["dtype"] = framework::ToDataType(in_var_tensor.type());
+      attrs["dtype"] = in_var_tensor.type();
       attrs["shape"] = framework::vectorize2int(in_var_tensor.dims());
       attrs["value"] = 0.0f;
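Note: the recurrent_op.cc and rnn_memory_helper_op.cc hunks rely on proto::VarType::Type being a valid operator attribute, so a zero-filling op can take its "dtype" straight from a tensor. A sketch of that pattern; the source tensor src and the output name "dst" are assumed for illustration:

    framework::AttributeMap attrs;
    attrs["dtype"] = src.type();  // the enum is stored directly as an Attribute
    attrs["shape"] = framework::vectorize2int(src.dims());
    attrs["value"] = 0.0f;
    // Builds a fill_constant op that writes zeros with src's dtype and shape.
    auto zero_op = framework::OpRegistry::CreateOp(
        "fill_constant", {}, {{"Out", {"dst"}}}, attrs);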
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index 79f189222e..6857b5ed9d 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -62,9 +62,8 @@ class ROIAlignOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
@@ -83,9 +82,8 @@ class ROIAlignGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 3f6b2e46c7..e46d92d6fc 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -69,9 +69,8 @@ class ROIPoolOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
@@ -90,9 +89,8 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index 5b05f757c0..a0b9fa305d 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -75,7 +75,7 @@ class SaveCombineOp : public framework::OperatorBase {
       // Serialize tensors one by one
 
       // Check types to see if a fp16 transformation is required
-      auto in_dtype = framework::ToDataType(tensor.type());
+      auto in_dtype = tensor.type();
       auto out_dtype =
           save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index e79cffcf49..e1c9fd8ff1 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -85,7 +85,7 @@ class SaveOp : public framework::OperatorBase {
                    filename);
 
     auto save_as_fp16 = Attr("save_as_fp16");
-    auto in_dtype = framework::ToDataType(tensor.type());
+    auto in_dtype = tensor.type();
     auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
 
     if (in_dtype != out_dtype) {
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index c32d2603cf..ad418d51bc 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -51,9 +51,8 @@ class ScatterOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
@@ -70,9 +69,8 @@ class ScatterGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
index 44b09bf7c2..1754221e77 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc
@@ -114,9 +114,8 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
index c49d1ccb18..8267c04f9f 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
@@ -112,9 +112,8 @@ class SequenceScatterOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   platform::CPUPlace());
   }
 };
@@ -131,9 +130,8 @@ class SequenceScatterGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   platform::CPUPlace());
   }
 };
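Note: the save_op.cc and save_combine_op.cc hunks keep the fp16-save logic intact; only the dtype handling changes. A sketch of that path, assuming tensor, place, save_as_fp16, and an output out_tensor; the actual cast goes through framework::TransDataType, whose use here is an illustration rather than a quote from the patch:

    auto in_dtype = tensor.type();
    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
    if (in_dtype != out_dtype) {
      // Both sides of the comparison are proto::VarType::Type values now,
      // so no ToDataType() bridge is needed before converting to FP16.
      framework::TransDataType(framework::OpKernelType(in_dtype, place),
                               framework::OpKernelType(out_dtype, place),
                               tensor, &out_tensor);
    }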
diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
index 6f84023e26..35f49f78ce 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc
@@ -50,9 +50,8 @@ class SequenceSliceOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
@@ -71,9 +70,8 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
index 644a5bebc1..027073e5d7 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc
@@ -51,7 +51,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
     }
     std::string data_format = ctx.Attr("data_format");
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(),
+        ctx.Input("X")->type(), ctx.GetPlace(),
         framework::StringToDataLayout(data_format), library_);
   }
 };
@@ -146,7 +146,7 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
     }
     std::string data_format = ctx.Attr("data_format");
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(),
+        ctx.Input("X")->type(), ctx.GetPlace(),
         framework::StringToDataLayout(data_format), library_);
   }
 };
diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc
index 9612f82b6d..21871d7656 100644
--- a/paddle/fluid/operators/similarity_focus_op.cc
+++ b/paddle/fluid/operators/similarity_focus_op.cc
@@ -70,9 +70,8 @@ class SimilarityFocusOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
index e55462d6cf..789e61b2d3 100644
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -59,9 +59,8 @@ class SliceOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Input")->type()),
-        ctx.GetPlace());
+    return framework::OpKernelType(ctx.Input("Input")->type(),
+                                   ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 091ce4e6e8..bc889a5a04 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -62,8 +62,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
   }
 #endif
 
-  auto input_data_type =
-      framework::ToDataType(ctx.Input("X")->type());
+  auto input_data_type = ctx.Input("X")->type();
   if (input_data_type == framework::proto::VarType::FP16) {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "float16 can only be used on GPU place");
@@ -169,8 +168,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
       layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
-    auto input_data_type = framework::ToDataType(
-        ctx.Input(framework::GradVarName("Out"))->type());
+    auto input_data_type =
+        ctx.Input(framework::GradVarName("Out"))->type();
     if (input_data_type == framework::proto::VarType::FP16) {
       PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                      "float16 can only be used on GPU place");
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 2900221485..0397c7791e 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -131,9 +131,8 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Logits")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("Logits")->type(),
+                                   ctx.device_context());
   }
 };
@@ -173,8 +172,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input(framework::GradVarName("Loss"))->type()),
+        ctx.Input(framework::GradVarName("Loss"))->type(),
         ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 7df14158f3..4f717a4355 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -91,9 +91,9 @@ class SumOp : public framework::OperatorWithKernel {
         continue;
       }
       if (dtype == -1) {
-        dtype = framework::ToDataType(tensor->type());
+        dtype = tensor->type();
       } else {
-        PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type()));
+        PADDLE_ENFORCE_EQ(dtype, tensor->type());
       }
     }
     PADDLE_ENFORCE_NE(dtype, -1,
@@ -106,8 +106,8 @@ class SumOp : public framework::OperatorWithKernel {
       for (auto& var : x_vars) {
         auto& value = var->Get().value();
         if (value.IsInitialized()) {
-          return framework::OpKernelType(framework::ToDataType(value.type()),
-                                         ctx.device_context(), layout, library);
+          return framework::OpKernelType(value.type(), ctx.device_context(),
+                                         layout, library);
         }
       }
       // if input sparse vars are not initialized, use an default kernel type.
@@ -118,9 +118,8 @@ class SumOp : public framework::OperatorWithKernel {
       auto& array = x_var->Get();
       for (auto& each : array) {
         if (each.numel() != 0) {
-          return framework::OpKernelType(framework::ToDataType(each.type()),
-                                         ctx.device_context(), layout,
-                                         library);
+          return framework::OpKernelType(each.type(), ctx.device_context(),
+                                         layout, library);
         }
       }
     }
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 6eef4c98c4..5b2aad55a4 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -76,10 +76,7 @@ class TensorRTEngineOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     auto input0 = ctx.Inputs("Xs").front();
     framework::OpKernelType kt = framework::OpKernelType(
-        framework::ToDataType(ctx.scope()
-                                  .FindVar(input0)
-                                  ->GetMutable()
-                                  ->type()),
+        ctx.scope().FindVar(input0)->GetMutable()->type(),
         ctx.GetPlace());
     return kt;
   }
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index bbd71db606..bc1f59bc1a 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -144,9 +144,8 @@ class Transpose2Op : public TransposeOp {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 };
@@ -194,9 +193,7 @@ class Transpose2OpGrad : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input(framework::GradVarName("Out"))
-                ->type()),
+        ctx.Input(framework::GradVarName("Out"))->type(),
         ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
index 6d2ccb38f6..11e505d6df 100644
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -74,9 +74,8 @@ class UnpoolOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 
  public:
@@ -113,9 +112,8 @@ class UnpoolOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
   }
 
  public:
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index 6a257cebf5..e2ae7caae1 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -56,9 +56,8 @@ class WarpCTCOp : public framework::OperatorWithKernel {
   }
 #endif
     framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Logits")->type()),
-        ctx.device_context(), layout_, library_);
+    return framework::OpKernelType(ctx.Input("Logits")->type(),
+                                   ctx.device_context(), layout_, library_);
   }
 };
@@ -136,9 +135,8 @@ class WarpCTCGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("Logits")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input("Logits")->type(),
+                                   ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc
index e7597f7324..60508f7ab8 100644
--- a/paddle/fluid/operators/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/yolov3_loss_op.cc
@@ -64,9 +64,8 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   platform::CPUPlace());
   }
 };
@@ -180,9 +179,8 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input("X")->type()),
-        platform::CPUPlace());
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   platform::CPUPlace());
   }
 };
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 7c539d25f6..cbb090adef 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -20,6 +20,7 @@
 #include   // NOLINT
 #include 
 #include 
+#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -28,14 +29,14 @@ namespace paddle {
 namespace platform {
 
-inline ncclDataType_t ToNCCLDataType(std::type_index type) {
-  if (type == typeid(float)) {  // NOLINT
+inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
+  if (type == framework::proto::VarType::FP32) {
     return ncclFloat;
-  } else if (type == typeid(double)) {  // NOLINT
+  } else if (type == framework::proto::VarType::FP64) {
     return ncclDouble;
-  } else if (type == typeid(int)) {  // NOLINT
+  } else if (type == framework::proto::VarType::INT32) {
     return ncclInt;
-  } else if (type == typeid(int64_t)) {  // NOLINT
+  } else if (type == framework::proto::VarType::INT64) {
     return ncclInt64;
   } else {
     PADDLE_THROW("Not supported");
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index dca0c01ab2..314ab98625 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -206,7 +206,7 @@ PYBIND11_MODULE(core, m) {
       .def("_get_float_element", TensorGetElement)
       .def("_set_double_element", TensorSetElement)
       .def("_get_double_element", TensorGetElement)
-      .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); });
+      .def("_dtype", [](Tensor &self) { return self.type(); });
 
   py::class_(m, "LoDTensor", R"DOC(
     LoDTensor is a Tensor with optional LoD information.
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index f67f40f19f..5e91f5b301 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -43,7 +43,7 @@ template
 struct CastToPyBufferImpl {
   using CUR_TYPE = typename std::tuple_element>::type;
   pybind11::buffer_info operator()(const framework::Tensor &tensor) {
-    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
+    if (framework::DataTypeTrait::DataType == tensor.type()) {
       auto dim_vec = framework::vectorize(tensor.dims());
       std::vector dims_outside;
       std::vector strides;
From 06f8aa5b97be564b878848acd216069e23081300 Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Wed, 12 Dec 2018 03:08:21 +0000
Subject: [PATCH 18/45] remove while_op support temporarily test=develop

---
 paddle/fluid/framework/executor.cc            |  3 +-
 .../fluid/operators/controlflow/while_op.cc   | 46 +------------------
 2 files changed, 3 insertions(+), 46 deletions(-)

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 767bbb524f..7eab876015 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -419,7 +419,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr gc;
-  if (max_memory_size >= 0) {
+  // skip while_op and while_grad_op temporarily
+  if (max_memory_size >= 0 && !keep_kids) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 06920a47ee..5ab0918c48 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -365,51 +365,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
     // while operator could be renamed.
     while_grad->SetAttr("original_output_grad", output_grads_list);
 
-    /* The following codes are used in eager deletion mode */
-    std::unordered_set bwd_skip_vars;
-    if (framework::GetEagerDeletionThreshold() >= 0) {
-      std::unordered_set fwd_skip_vars;
-      for (auto *op_desc : grad_block->AllOps()) {
-        auto skippable = [&](const std::string &name) {
-          return !grad_block->HasVar(name) &&
-                 (fwd_block->HasVarRecursive(name) ||
-                  parent_block->HasVarRecursive(name));
-        };
-        for (auto &in_arg_name : op_desc->InputArgumentNames()) {
-          if (skippable(in_arg_name)) {
-            fwd_skip_vars.insert(in_arg_name);
-          }
-        }
-
-        for (auto &out_arg_name : op_desc->OutputArgumentNames()) {
-          if (skippable(out_arg_name)) {
-            fwd_skip_vars.insert(out_arg_name);
-          }
-        }
-      }
-
-      if (!fwd_skip_vars.empty()) {
-        // FIXME(zjl): ugly const_cast here, maybe we should find a better way
-        // to modify forward while_op
-        auto &fwd_while_op = const_cast(ForwardOp());
-        fwd_while_op.SetAttr(kSkipEagerDeletionVars,
-                             std::vector(fwd_skip_vars.begin(),
-                                         fwd_skip_vars.end()));
-      }
-
-      // Find backward skip vars
-      auto fwd_input = Input(kX);
-      for (size_t i = 0; i < igs.size(); ++i) {
-        if (igs[i] == framework::kEmptyVarName) {
-          continue;
-        }
-        bwd_skip_vars.insert(igs[i]);
-        bwd_skip_vars.insert(framework::GradVarName(fwd_input[i]));
-      }
-    }
-    while_grad->SetAttr(
-        kSkipEagerDeletionVars,
-        std::vector(bwd_skip_vars.begin(), bwd_skip_vars.end()));
+    while_grad->SetAttr(kSkipEagerDeletionVars, std::vector());
 
     return std::unique_ptr(while_grad);
   }

From c00e07cda02ce611f0c10ed9cbc64f9a59f42f73 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Wed, 12 Dec 2018 14:58:51 +0800
Subject: [PATCH 19/45] Fix distribute compile test=develop

---
 .../fluid/framework/details/reduce_op_handle.cc | 10 +++++-----
 paddle/fluid/operators/distributed/grpc_serde.cc |  3 +--
 .../operators/distributed/sendrecvop_utils.cc   |  6 ++----
 .../operators/distributed/sendrecvop_utils.h    | 13 +++++++------
 .../operators/distributed/variable_response.cc  | 15 +++++++--------
 .../operators/distributed_ops/merge_ids_op.cc   |  4 +---
 .../distributed_ops/ref_by_trainer_id_op.cc     |  4 +---
 7 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 85d8abc910..7a5f7de57e 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -218,18 +218,18 @@ void ReduceOpHandle::RunImpl() {
       }
 
 #if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
-      if (framework::IsType(in_selected_rows[0]->value().type())) {
+      if (in_selected_rows[0]->value().type() ==
+          framework::proto::VarType::FP32) {
         GatherSelectedRows(
             in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
             out_var->GetMutable());
-      } else if (framework::IsType(
-                     in_selected_rows[0]->value().type())) {
+      } else if (in_selected_rows[0]->value().type() ==
+                 framework::proto::VarType::FP64) {
         GatherSelectedRows(
            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
            out_var->GetMutable());
       } else {
-        PADDLE_ENFORCE(false,
-                       "only support double or float when gahter SelectedRows");
+        PADDLE_THROW("only support double or float when gather SelectedRows");
       }
 #endif
     });
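Note: the reduce_op_handle.cc hunk above swaps RTTI-style dispatch (framework::IsType<float> on a std::type_index) for direct enum comparisons. The same dispatch idea, sketched as a switch over a SelectedRows value type; GatherRows is a hypothetical helper, not part of this patch:

    switch (in.value().type()) {
      case framework::proto::VarType::FP32:
        GatherRows<float>(in, &out);   // hypothetical float path
        break;
      case framework::proto::VarType::FP64:
        GatherRows<double>(in, &out);  // hypothetical double path
        break;
      default:
        PADDLE_THROW("only support double or float when gather SelectedRows");
    }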
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
index 31fac2133c..94bf0a113b 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -122,8 +122,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   if (var->IsType()) {
     auto* slr = var->GetMutable();
     ProtoEncodeHelper e2(static_cast(buf), 128);
-    size_t rows_memory_size =
-        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
+    size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
     e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
     slices[2] = ::grpc::Slice(e2.size());
     memcpy(const_cast(slices[2].begin()), e2.data(), e2.size());
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 6ba883ba01..5fd42e884a 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -61,8 +61,7 @@ TensorPayload GetTensorPayload(framework::Variable* var,
   auto tensor = var->Get();
   // FIXME(wuyi): data types in send_recv.proto is copied from
   // framework.proto
-  request->set_data_type(
-      static_cast(framework::ToDataType(tensor.type())));
+  request->set_data_type(static_cast(tensor.type()));
   for (auto& dim : framework::vectorize(tensor.dims())) {
     request->add_dims(dim);
   }
@@ -83,8 +82,7 @@ TensorPayload GetSelectedRowsPayload(framework::Variable* var,
                                      const platform::DeviceContext& ctx,
                                      VarMsg* request) {
   auto* slr = var->GetMutable();
-  request->set_data_type(
-      static_cast(framework::ToDataType(slr->value().type())));
+  request->set_data_type(static_cast(slr->value().type()));
   request->set_lod_level(0);
   request->set_slr_height(slr->height());
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
index 523e56fe3e..710c839166 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -58,18 +58,19 @@ TensorPayload GetSelectedRowsPayload(framework::Variable* var,
                                      const platform::DeviceContext& ctx,
                                      VarMsg* request);
 
-inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
+inline framework::proto::VarType::Type ToVarType(
+    sendrecv::VariableMessage::Type type) {
   switch (type) {
     case sendrecv::VariableMessage::FP32:
-      return typeid(float);  // NOLINT
+      return framework::proto::VarType::FP32;  // NOLINT
     case sendrecv::VariableMessage::FP64:
-      return typeid(double);  // NOLINT
+      return framework::proto::VarType::FP64;  // NOLINT
     case sendrecv::VariableMessage::INT32:
-      return typeid(int);  // NOLINT
+      return framework::proto::VarType::INT32;  // NOLINT
    case sendrecv::VariableMessage::INT64:
-      return typeid(int64_t);  // NOLINT
+      return framework::proto::VarType::INT64;  // NOLINT
    case sendrecv::VariableMessage::BOOL:
-      return typeid(bool);  // NOLINT
+      return framework::proto::VarType::BOOL;  // NOLINT
    default:
      PADDLE_THROW("Not support type %d", type);
  }
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
index 5b2be04e6a..921c96b583 100644
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -114,7 +114,7 @@ bool VariableResponse::CopyLodTensorData(
   tensor->set_lod(lod);
 
   void* tensor_data =
-      tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
+      tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type()));
 
   VLOG(6) << "Tensor.memory_size = " << tensor->memory_size()
           << ", Buffer Size = " << length;
@@ -139,13 +139,13 @@ bool VariableResponse::CopySelectRowsTensorData(
   slr->set_height(meta_.slr_height());
   auto* tensor = slr->mutable_value();
   tensor->Resize(dims);
-  PADDLE_ENFORCE_EQ(static_cast(tensor->numel()),
-                    length / framework::SizeOfType(
-                                 paddle::operators::distributed::ToTypeIndex(
-                                     meta_.data_type())));
+  PADDLE_ENFORCE_EQ(
+      static_cast(tensor->numel()),
+      length / framework::SizeOfType(paddle::operators::distributed::ToVarType(
+                   meta_.data_type())));
   void* tensor_data = tensor->mutable_data(
       ctx.GetPlace(),
-      paddle::operators::distributed::ToTypeIndex(meta_.data_type()));
+      paddle::operators::distributed::ToVarType(meta_.data_type()));
 
   if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
     return false;
@@ -159,8 +159,7 @@ bool VariableResponse::CopySelectRowsData(
     const platform::DeviceContext& ctx, int length) {
   auto* slr = GetVar()->GetMutable();
   slr->mutable_rows()->clear();
-  slr->mutable_rows()->resize(length /
-                              framework::SizeOfType(typeid(int64_t)));  // int64
+  slr->mutable_rows()->resize(length / sizeof(int64_t));  // int64
   int64_t* rows_data = slr->mutable_rows()->data();
 
   // copy rows CPU data, GPU data will be copied lazily.
diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
index 252a63cb60..da0185b8c4 100644
--- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
+++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
@@ -108,9 +108,7 @@ class MergeIdsOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.MultiInput("X").front()->type()),
-        ctx.GetPlace());
+        ctx.MultiInput("X").front()->type(), ctx.GetPlace());
   }
 };
diff --git a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
index 98b0af7688..7e16e6ff66 100644
--- a/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
+++ b/paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc
@@ -42,9 +42,7 @@ class RefByTrainerIdOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.MultiInput("X")[0]->type()),
-        ctx.GetPlace());
+        ctx.MultiInput("X")[0]->type(), ctx.GetPlace());
  }
};
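Note: renaming ToTypeIndex() to ToVarType() changes only the C++ return type; the sendrecv wire enum still maps one-to-one onto framework::proto::VarType::Type. A sketch of sizing a received buffer with the new helper, assuming a VariableMessage named meta and a raw payload size length:

    auto var_type = operators::distributed::ToVarType(meta.data_type());
    size_t elem_size = framework::SizeOfType(var_type);
    // The payload must contain a whole number of elements of this dtype.
    PADDLE_ENFORCE_EQ(length % elem_size, static_cast<size_t>(0),
                      "payload size must be a multiple of the element size");
    int64_t numel = static_cast<int64_t>(length / elem_size);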
From e82772f42518f1cff790ac04aa1c73c2e5b201e9 Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Wed, 12 Dec 2018 09:22:44 +0000
Subject: [PATCH 20/45] fix cmake conflict test=develop

---
 paddle/fluid/framework/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index b1cfb23f3a..6d7a69c8c9 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -169,7 +169,7 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
 cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
 
 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper garbage_collector)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()

From 8d9401152eaf26cc0d6ab4643fe6255028d6edf2 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Wed, 12 Dec 2018 17:43:39 +0800
Subject: [PATCH 21/45] Refine w2v

---
 .../fluid/operators/math/matrix_bit_code.cc   | 22 ++++++----
 paddle/fluid/operators/math/matrix_bit_code.h | 40 +++++++++----------
 2 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
index 5a6e64b6f8..dbf4f5e332 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -23,12 +23,14 @@ void MatrixBitCodeFunctor::Add(const framework::Tensor& vec,
                                framework::Tensor* tmat) {
   size_t batch_size = tmat->dims()[0];
   size_t width = tmat->dims()[1];
+  auto* tmat_data = tmat->data();
+  auto* vec_data = vec.data();
   for (size_t i = 0; i < batch_size; ++i) {
     auto code = code_table_->get_code(i);
     int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
       size_t index = code->calc_index(j);
-      tmat->data()[i * width + j] += vec.data()[index];
+      tmat_data[i * width + j] += vec_data[index];
     }
   }
 }
@@ -38,12 +40,14 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat,
                                    framework::Tensor* vec) {
   size_t batch_size = tmat.dims()[0];
   size_t width = tmat.dims()[1];
+  auto* vec_data = vec->data();
+  auto* tmat_data = tmat.data();
   for (size_t i = 0; i < batch_size; ++i) {
     auto code = code_table_->get_code(i);
     int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
       size_t index = code->calc_index(j);
-      vec->data()[index] += tmat.data()[i * width + j];
+      vec_data[index] += tmat_data[i * width + j];
     }
   }
 }
@@ -53,14 +57,15 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat,
                                    framework::SelectedRows* vec) {
   size_t batch_size = tmat.dims()[0];
   size_t width = tmat.dims()[1];
+  auto* vec_data = vec->mutable_value()->data();
+  auto* tmat_data = tmat.data();
   for (size_t i = 0; i < batch_size; ++i) {
     auto code = code_table_->get_code(i);
     int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
       size_t index = code->calc_index(j);
       int64_t row_index = vec->GetIndexFromId(static_cast(index));
-      vec->mutable_value()->data()[row_index] +=
-          tmat.data()[i * width + j];
+      vec_data[row_index] += tmat_data[i * width + j];
     }
   }
 }
@@ -70,6 +75,8 @@ void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat,
                                framework::Tensor* sum, T scale_sum) {
   size_t num_samples = tmat.dims()[0];
   size_t o_width = tmat.dims()[1];
+  auto* tmat_data = tmat.data();
+  auto* sum_data = sum->data();
   for (size_t i = 0; i < num_samples; ++i) {
     T sm = static_cast(0.0);
     auto code = code_table_->get_code(i);
@@ -78,10 +85,10 @@ void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat,
       if (code->calc_bit(j)) {
         // calc_bit starts from right most bit, while data in tmat[i] is in the
         // reverse order.
-        sm += tmat.data()[i * o_width + j];
+        sm += tmat_data[i * o_width + j];
       }
     }
-    sum->data()[i] = scale_sum * sm;
+    sum_data[i] = scale_sum * sm;
   }
 }
@@ -217,12 +224,13 @@ template
 void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) {
   size_t num_samples = tmat->dims()[0];
   size_t o_width = tmat->dims()[1];
+  auto* tmat_data = tmat->data();
   for (size_t i = 0; i < num_samples; ++i) {
     auto code = code_table_->get_code(i);
     int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
       if (code->calc_bit(j)) {
-        tmat->data()[i * o_width + j] -= 1;
+        tmat_data[i * o_width + j] -= 1;
       }
     }
   }
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index 35ca73802b..ba1745b86d 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -140,13 +140,13 @@ template
 class CustomCode : public Code {
  public:
   CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode,
-             const int64_t* ids, int index)
-      : ids_(ids), index_(index) {
-    ptable_ = ptable.Slice(index, index + 1);
-    pcode_ = pcode.Slice(index, index + 1);
+             const int64_t* ids, int index) {
+    seq_len_ = ptable.dims()[1];
+    ptable_data_ = ptable.data() + seq_len_ * index;
+    pcode_data_ = pcode.data() + seq_len_ * index;
   }
   /**
-   * Here the id of root shoud be 1 rather than 0, thus the encoding of class c
+   * Here the id of root should be 1 rather than 0, thus the encoding of class c
    * is `c + num_classes` and all siblings can get the same weight indice using
    * prefixes.
   * Weight index is the prefixes of encoding, thus leave out the right most
   * bit in calc_index.
   * Binary classification path is the suffixes of encoding, thus leave out the
   * left most bit in calc_bit.
   */
-  size_t calc_index(int bit) const { return ptable_.data()[bit]; }
-  bool calc_bit(int bit) const { return pcode_.data()[bit]; }
-  int get_length() const {
-    int length = 0;
+  size_t calc_index(int bit) const override { return ptable_data_[bit]; }
+  bool calc_bit(int bit) const override { return pcode_data_[bit]; }
 
-    for (int i = 0; i < static_cast(ptable_.dims()[1]); i++) {
-      if (ptable_.data()[i] >= 0) {
-        length++;
-      } else {
-        return length;
-      }
+  // NOTE: this function is not thread-safe.
+  int get_length() const override {
+    if (length_ < 0) {
+      auto len = seq_len_;
+      length_ =
+          static_cast(std::find_if(ptable_data_, ptable_data_ + len,
+                                   [](const T& val) { return val < 0; }) -
+                      ptable_data_);
    }
-    return length;
+    return length_;
  }

 private:
-  framework::Tensor ptable_;
-  framework::Tensor pcode_;
-  const int64_t* ids_;
-  const int index_;
+  int64_t seq_len_;
+  const T* ptable_data_;
+  const T* pcode_data_;
+  mutable int length_{-1};
};

class SimpleCodeTable : public CodeTable {
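Note: the CustomCode rewrite above replaces per-call Tensor slicing with raw pointers into the original ptable/pcode buffers and memoizes the code length: std::find_if locates the first negative id (the end-of-code sentinel) once, and the cached value is reused on every later call. Reduced to its core, directly mirroring the patch (length_ starts at -1, meaning "not computed yet"):

    int get_length() const {
      if (length_ < 0) {
        // Scan once for the first negative id; its offset is the code length.
        length_ = static_cast<int>(
            std::find_if(ptable_data_, ptable_data_ + seq_len_,
                         [](const T& val) { return val < 0; }) -
            ptable_data_);
      }
      return length_;
    }

As the patch's own NOTE says, the mutable cache makes this non-thread-safe; each Code object is assumed to be used from a single thread.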
From be113756610c2894ae2adfeab40c8dfe879620a9 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Wed, 12 Dec 2018 18:06:58 +0800
Subject: [PATCH 22/45] Refine code

---
 .../fluid/operators/math/matrix_bit_code.cc   |  4 +--
 paddle/fluid/operators/math/matrix_bit_code.h | 30 ++++++++++++++-----
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
index dbf4f5e332..92affa0e4e 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/matrix_bit_code.h"
 #include 
+#include 
 
 namespace paddle {
 namespace operators {
 namespace math {
@@ -133,8 +134,7 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat,
   auto weight_value = weight->data();
   auto input_value = input.data();
 
-  std::unordered_map>> ops;
-
+  std::map>> ops;
   for (size_t i = 0; i < num_samples; ++i) {
     auto code = code_table_->get_code(i);
     int code_length = code->get_length();
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index ba1745b86d..cf43ad9d44 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include 
 #include 
 #include 
 #include 
@@ -109,7 +110,7 @@ class Code {
 // set a CodeTable interface to create multiple code table
 class CodeTable {
  public:
-  virtual std::unique_ptr get_code(int64_t code) const = 0;
+  virtual Code* get_code(int64_t code) const = 0;
   virtual size_t size() const = 0;
   virtual int get_max_code_length() const = 0;
   virtual ~CodeTable() {}
@@ -180,14 +181,23 @@ class SimpleCodeTable : public CodeTable {
 public:
  SimpleCodeTable(size_t num_classes, const int64_t* ids)
      : num_classes_(num_classes), ids_(ids) {}
-  std::unique_ptr get_code(int64_t code) const {
-    std::unique_ptr coder(new SimpleCode(code, num_classes_, ids_));
-    return coder;
+
+  Code* get_code(int64_t code) const {
+    auto it = codes_.find(code);
+    if (it != codes_.end()) {
+      return it->second.get();
+    }
+    auto* result = new SimpleCode(code, num_classes_, ids_);
+    codes_.emplace(code, std::unique_ptr(result));
+    return result;
  }
+
  size_t size() const { return num_classes_; }
  int get_max_code_length() const { return FindLastSet(num_classes_ - 1); }

 private:
+  mutable std::map> codes_;
+
  size_t num_classes_;
  const int64_t* ids_;
};
@@ -199,9 +209,14 @@ class CustomCodeTable : public CodeTable {
                  const framework::Tensor& pcode, const int64_t* ids)
      : ptable_(ptable), pcode_(pcode), ids_(ids) {}

-  std::unique_ptr get_code(int64_t code) const {
-    std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code));
-    return coder;
+  Code* get_code(int64_t code) const {
+    auto it = codes_.find(code);
+    if (it != codes_.end()) {
+      return it->second.get();
+    }
+    auto* result = new CustomCode(ptable_, pcode_, ids_, code);
+    codes_.emplace(code, std::unique_ptr(result));
+    return result;
  }

  size_t size() const { return static_cast(ptable_.dims()[1]); }
@@ -210,6 +225,7 @@ class CustomCodeTable : public CodeTable {
  }

 private:
+  mutable std::unordered_map> codes_;
  const framework::Tensor& ptable_;
  const framework::Tensor& pcode_;
  const int64_t* ids_;
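Note: PATCH 22 stops allocating a fresh Code object on every get_code() call; the table now lazily builds each code once and hands out a raw pointer owned by an internal cache. The lookup idiom, extracted from the hunk above:

    Code* get_code(int64_t code) const {
      auto it = codes_.find(code);
      if (it != codes_.end()) {
        return it->second.get();  // hit: reuse the cached Code object
      }
      auto* result = new SimpleCode(code, num_classes_, ids_);
      codes_.emplace(code, std::unique_ptr<Code>(result));
      return result;  // the cache owns it, so the raw pointer stays valid
    }

The matching MulGradWeight change from std::unordered_map to std::map presumably trades hash lookups for ordered iteration, so row updates are applied in a deterministic order.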
From 162637b64abd39c3ca7c75c08690169968305712 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Thu, 13 Dec 2018 10:56:08 +0800
Subject: [PATCH 23/45] Fix ngraph compile test=develop

---
 paddle/fluid/framework/ngraph_operator.cc | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index 253de4c611..e2cdfc845f 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -471,27 +471,23 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
     auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
     PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
                    "Ensure ngraph tensor layout align with paddle tensor");
-    if (tensor_pd->type().hash_code() ==
-        typeid(float).hash_code()) {  // NOLINT
+    if (tensor_pd->type() == proto::VarType::FP32) {
       const float* arr = tensor_pd->data<float>();
       ti = backend_->create_tensor(ngraph::element::f32, sp,
                                    const_cast<float*>(arr));
-    } else if (tensor_pd->type().hash_code() ==
-               typeid(int).hash_code()) {  // NOLINT
+    } else if (tensor_pd->type() == proto::VarType::INT32) {
       const int* arr = tensor_pd->data<int>();
       ti = backend_->create_tensor(ngraph::element::i32, sp,
                                    const_cast<int*>(arr));
-    } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) {
+    } else if (tensor_pd->type() == proto::VarType::INT64) {
       const int64_t* arr = tensor_pd->data<int64_t>();
       ti = backend_->create_tensor(ngraph::element::i64, sp,
                                    const_cast<int64_t*>(arr));
-    } else if (tensor_pd->type().hash_code() ==
-               typeid(double).hash_code()) {  // NOLINT
+    } else if (tensor_pd->type() == proto::VarType::FP64) {
      const double* arr = tensor_pd->data<double>();
      ti = backend_->create_tensor(ngraph::element::f64, sp,
                                   const_cast<double*>(arr));
-    } else if (tensor_pd->type().hash_code() ==
-               typeid(bool).hash_code()) {  // NOLINT
+    } else if (tensor_pd->type() == proto::VarType::BOOL) {
       const bool* arr = tensor_pd->data<bool>();
       ti = backend_->create_tensor(ngraph::element::boolean, sp,
                                    const_cast<bool*>(arr));

From fa1f77e20ca2134f52ab01049a7070a2f0a9a3c8 Mon Sep 17 00:00:00 2001
From: Yancey1989
Date: Wed, 12 Dec 2018 18:14:03 +0800
Subject: [PATCH 24/45] enable ci test=develop

---
 .../fluid/tests/unittests/test_dist_base.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 26fa20291b..8456651266 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -224,6 +224,7 @@ class TestDistBase(unittest.TestCase):
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
+        self._port_set = set()
         self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
             self._find_free_port(), self._find_free_port())
         self._python_interp = sys.executable
@@ -238,9 +239,17 @@ class TestDistBase(unittest.TestCase):
         self._after_setup_config()

     def _find_free_port(self):
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-            s.bind(('', 0))
-            return s.getsockname()[1]
+        def __free_port():
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as s:
+                s.bind(('', 0))
+                return s.getsockname()[1]
+
+        while True:
+            port = __free_port()
+            if port not in self._port_set:
+                self._port_set.add(port)
+                return port

     def start_pserver(self, model_file, check_error_log, required_envs):
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
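[Editorial sketch, not part of the patch.] The ngraph fix above replaces typeid(...).hash_code() comparisons with comparisons against the proto::VarType enum: an ordinary integer compare that does not depend on RTTI hash values. A standalone C++ sketch of the same dispatch pattern, with illustrative names only (DType and SizeOfDType are not the Paddle API):

#include <cstddef>
#include <cstdint>
#include <stdexcept>

// Stand-in for a dtype tag such as proto::VarType. Comparing enum values is
// a plain integer comparison and is trivially switchable.
enum class DType { FP32, FP64, INT32, INT64, BOOL };

inline std::size_t SizeOfDType(DType t) {
  switch (t) {
    case DType::FP32:  return sizeof(float);
    case DType::FP64:  return sizeof(double);
    case DType::INT32: return sizeof(std::int32_t);
    case DType::INT64: return sizeof(std::int64_t);
    case DType::BOOL:  return sizeof(bool);
  }
  throw std::runtime_error("unsupported dtype");  // unreachable if exhaustive
}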
From 2328bee1cc835d789b83cd4da9bef6b588bc87c5 Mon Sep 17 00:00:00 2001
From: sneaxiy
Date: Thu, 13 Dec 2018 06:34:09 +0000
Subject: [PATCH 25/45] fix Windows compile bug test=develop

---
 .../framework/details/eager_deletion_op_handle.cc |  6 +++---
 paddle/fluid/framework/executor.cc                | 10 ++++++----
 paddle/fluid/framework/tensor.h                   |  2 +-
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index 3b27415e43..abacb11e3b 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -77,14 +77,14 @@ void EagerDeletionOpHandle::RunImpl() {
     VLOG(2) << "Erase variable " << name;

     if (var->IsType<LoDTensor>()) {
-      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
+      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
     } else if (var->IsType<SelectedRows>()) {
       garbages.emplace_back(
-          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
+          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
     } else if (var->IsType<LoDTensorArray>()) {
       auto *tensor_arr = var->GetMutable<LoDTensorArray>();
       for (auto &t : *tensor_arr) {
-        garbages.emplace_back(t.MoveMemory());
+        garbages.emplace_back(t.MoveMemoryHolder());
       }
     } else {
       PADDLE_THROW("Type %s of %s is not supported eager deletion",

diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 16c4552a5f..0c4bd336c5 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -106,14 +106,16 @@ static void DeleteUnusedTensors(
       VLOG(2) << "Erase variable " << name;

       if (var->IsType<LoDTensor>()) {
-        garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemory());
-      } else if (var->IsType<SelectedRows>()) {
         garbages.emplace_back(
-            var->GetMutable<SelectedRows>()->mutable_value()->MoveMemory());
+            var->GetMutable<LoDTensor>()->MoveMemoryHolder());
+      } else if (var->IsType<SelectedRows>()) {
+        garbages.emplace_back(var->GetMutable<SelectedRows>()
+                                  ->mutable_value()
+                                  ->MoveMemoryHolder());
       } else if (var->IsType<LoDTensorArray>()) {
         auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
         for (auto& t : *lod_tensor_arr) {
-          garbages.emplace_back(t.MoveMemory());
+          garbages.emplace_back(t.MoveMemoryHolder());
         }
       } else {
         PADDLE_THROW("Type %s of %s is not supported eager deletion",

diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 9f7027f5ae..153222506a 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -158,7 +158,7 @@ class Tensor {
   const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
   size_t offset() const { return offset_; }

-  std::shared_ptr<memory::Allocation> MoveMemory() {
+  std::shared_ptr<memory::Allocation> MoveMemoryHolder() {
     return std::move(holder_);
   }

From 15550a27536012a92e2e7badaee7b41afff31f3e Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Thu, 13 Dec 2018 14:46:22 +0800
Subject: [PATCH 26/45] Polish code

---
 cmake/external/python.cmake                   |   6 +-
 .../fluid/operators/math/matrix_bit_code.cc   | 486 ++++++++++++------
 paddle/fluid/operators/math/matrix_bit_code.h |  63 +--
 3 files changed, 340 insertions(+), 215 deletions(-)

diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index a3599dd798..52ad02a355 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,8 +18,8 @@ ENDIF()

 INCLUDE(python_module)

-FIND_PACKAGE(PythonInterp ${PY_VERSION})
-FIND_PACKAGE(PythonLibs ${PY_VERSION})
+FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED)
+FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED)

 if(WIN32)
   execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
@@ -79,6 +79,6 @@ IF(PYTHONINTERP_FOUND)
             "please use pip to upgrade protobuf. pip install -U protobuf")
     ENDIF()
 ENDIF(PYTHONINTERP_FOUND)
-
+message(STATUS ${PYTHON_INCLUDE_DIR})
 INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
 INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})

diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
index 92affa0e4e..d55e832cc2 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -15,225 +15,379 @@ limitations under the License.
*/ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include #include + namespace paddle { namespace operators { namespace math { template -void MatrixBitCodeFunctor::Add(const framework::Tensor& vec, - framework::Tensor* tmat) { - size_t batch_size = tmat->dims()[0]; - size_t width = tmat->dims()[1]; - auto* tmat_data = tmat->data(); - auto* vec_data = vec.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - tmat_data[i * width + j] += vec_data[index]; +struct MatrixBitCodeFunctorAdd : public boost::static_visitor { + const framework::Tensor &vec_; + framework::Tensor *tmat_; + + MatrixBitCodeFunctorAdd(const framework::Tensor &vec, framework::Tensor *tmat) + : vec_(vec), tmat_(tmat) {} + + template + void operator()(const CodeTable &code_table) { + size_t batch_size = tmat_->dims()[0]; + size_t width = tmat_->dims()[1]; + auto *tmat_data = tmat_->data(); + auto *vec_data = vec_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + tmat_data[i * width + j] += vec_data[index]; + } } } +}; + +template +void MatrixBitCodeFunctor::Add(const framework::Tensor &vec, + framework::Tensor *tmat) { + MatrixBitCodeFunctorAdd func(vec, tmat); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::Tensor* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - auto* vec_data = vec->data(); - auto* tmat_data = tmat.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - vec_data[index] += tmat_data[i * width + j]; +struct MatrixBitCodeFunctorAddGrad : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *vec_; + MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat, + framework::Tensor *vec) + : tmat_(tmat), vec_(vec) {} + + template + void operator()(const CodeTable &table) { + size_t batch_size = tmat_.dims()[0]; + size_t width = tmat_.dims()[1]; + auto *vec_data = vec_->data(); + auto *tmat_data = tmat_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + vec_data[index] += tmat_data[i * width + j]; + } } } +}; + +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, + framework::Tensor *vec) { + MatrixBitCodeFunctorAddGrad func(tmat, vec); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::SelectedRows* vec) { - size_t batch_size = tmat.dims()[0]; - size_t width = tmat.dims()[1]; - auto* vec_data = vec->mutable_value()->data(); - auto* tmat_data = tmat.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - int64_t row_index = vec->GetIndexFromId(static_cast(index)); - vec_data[row_index] += tmat_data[i * width + j]; +struct MatrixBitCodeFunctorSelectedRowsAddGrad + : public boost::static_visitor 
{ + const framework::Tensor &tmat_; + framework::SelectedRows *vec_; + + MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat, + framework::SelectedRows *vec) + : tmat_(tmat), vec_(vec) {} + + template + void operator()(const CodeTable &code_table) { + size_t batch_size = tmat_.dims()[0]; + size_t width = tmat_.dims()[1]; + auto *vec_data = vec_->mutable_value()->template data(); + auto *tmat_data = tmat_.data(); + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + int64_t row_index = vec_->GetIndexFromId(static_cast(index)); + vec_data[row_index] += tmat_data[i * width + j]; + } } } +}; + +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, + framework::SelectedRows *vec) { + MatrixBitCodeFunctorSelectedRowsAddGrad func(tmat, vec); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, - framework::Tensor* sum, T scale_sum) { - size_t num_samples = tmat.dims()[0]; - size_t o_width = tmat.dims()[1]; - auto* tmat_data = tmat.data(); - auto* sum_data = sum->data(); - for (size_t i = 0; i < num_samples; ++i) { - T sm = static_cast(0.0); - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - if (code->calc_bit(j)) { - // calc_bit starts from right most bit, while data in tmat[i] is in the - // reverse order. - sm += tmat_data[i * o_width + j]; +struct MatrixBitCodeFunctorSum : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *sum_; + T scale_sum_; + + MatrixBitCodeFunctorSum(const framework::Tensor &tmat, framework::Tensor *sum, + T scale_sum) + : tmat_(tmat), sum_(sum), scale_sum_(scale_sum) {} + + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_.dims()[0]; + size_t o_width = tmat_.dims()[1]; + auto *tmat_data = tmat_.data(); + auto *sum_data = sum_->data(); + for (size_t i = 0; i < num_samples; ++i) { + T sm = static_cast(0.0); + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + // calc_bit starts from right most bit, while data in tmat[i] is in + // the + // reverse order. 
+ sm += tmat_data[i * o_width + j]; + } } + sum_data[i] = scale_sum_ * sm; } - sum_data[i] = scale_sum * sm; } +}; + +template +void MatrixBitCodeFunctor::Sum(const framework::Tensor &tmat, + framework::Tensor *sum, T scale_sum) { + MatrixBitCodeFunctorSum func(tmat, sum, scale_sum); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, - const framework::Tensor& weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat->dims()[0]; - size_t tmat_width = tmat->dims()[1]; - size_t input_width = input.dims()[1]; - size_t weight_width = weight.dims()[1]; - auto tmat_value = tmat->data(); - auto weight_value = weight.data(); - auto input_value = input.data(); - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_row = input_value + input_width * i; - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - const T* weight_row = weight_value + weight_width * index; - T sum = static_cast(0.0); - sum = blas.DOT(input_width, weight_row, input_row); - tmat_value[i * tmat_width + j] += sum; +struct MatrixBitCodeFunctorMul : public boost::static_visitor { + framework::Tensor *tmat_; + const framework::Tensor &weight_; + const framework::Tensor &input_; + + MatrixBitCodeFunctorMul(framework::Tensor *tmat, + const framework::Tensor &weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_->dims()[0]; + size_t tmat_width = tmat_->dims()[1]; + size_t input_width = input_.dims()[1]; + size_t weight_width = weight_.dims()[1]; + auto tmat_value = tmat_->data(); + auto weight_value = weight_.data(); + auto input_value = input_.data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_row = input_value + input_width * i; + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + const T *weight_row = weight_value + weight_width * index; + T sum = blas.DOT(input_width, weight_row, input_row); + tmat_value[i * tmat_width + j] += sum; + } } } +}; + +template +void MatrixBitCodeFunctor::Mul(framework::Tensor *tmat, + const framework::Tensor &weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMul func(tmat, weight, input); + code_table_.apply_visitor(func); } +template +class ReservedVector : public std::vector { + public: + ReservedVector() { this->reserve(N); } +}; + template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, - framework::Tensor* weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat.dims()[0]; - size_t input_width = input.dims()[1]; - size_t tmat_width = tmat.dims()[1]; - size_t weight_width = weight->dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight->data(); - auto input_value = input.data(); - - std::map>> ops; - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_value_row = input_value + input_width * i; - const T* tmat_row = tmat_value + i * tmat_width; - for (int j = 0; j < code_length; ++j) { - ops[code->calc_index(j)].emplace_back(tmat_row[j], 
input_value_row); +struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::Tensor *weight_; + const framework::Tensor &input_; + MatrixBitCodeFunctorMulGradWeight(const framework::Tensor &tmat, + framework::Tensor *weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_.dims()[0]; + size_t input_width = input_.dims()[1]; + size_t tmat_width = tmat_.dims()[1]; + size_t weight_width = weight_->dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = weight_->data(); + auto input_value = input_.data(); + + std::map, 8u>> ops; + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_value_row = input_value + input_width * i; + const T *tmat_row = tmat_value + i * tmat_width; + for (int j = 0; j < code_length; ++j) { + ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } } - } - for (auto& op : ops) { - auto& op_in_row = op.second; - for (auto& pair : op_in_row) { - auto& scale = pair.first; - auto* input_row = pair.second; - T* weight_row = weight_value + op.first * weight_width; - blas.AXPY(input_width, scale, input_row, weight_row); + for (auto &op : ops) { + auto &op_in_row = op.second; + for (auto &pair : op_in_row) { + auto &scale = pair.first; + auto *input_row = pair.second; + T *weight_row = weight_value + op.first * weight_width; + blas.AXPY(input_width, scale, input_row, weight_row); + } } } +}; + +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, + framework::Tensor *weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMulGradWeight func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, - framework::SelectedRows* weight, - const framework::Tensor& input) { - auto blas = - GetBlas(platform::CPUDeviceContext()); - size_t num_samples = tmat.dims()[0]; - size_t input_width = input.dims()[1]; - size_t tmat_width = tmat.dims()[1]; - size_t weight_width = weight->value().dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight->mutable_value()->data(); - auto input_value = input.data(); - - std::unordered_map>> ops; - ops.reserve(weight->rows().size()); - - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - const T* input_value_row = input_value + input_width * i; - const T* tmat_row = tmat_value + i * tmat_width; - for (int j = 0; j < code_length; ++j) { - ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); +struct MatrixBitCodeFunctorMulGradWeightSR + : public boost::static_visitor { + const framework::Tensor &tmat_; + framework::SelectedRows *weight_; + const framework::Tensor &input_; + + MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat, + framework::SelectedRows *weight, + const framework::Tensor &input) + : tmat_(tmat), weight_(weight), input_(input) {} + + template + void operator()(const CodeTable &code_table) { + auto blas = + GetBlas(platform::CPUDeviceContext()); + size_t num_samples = tmat_.dims()[0]; + size_t input_width = input_.dims()[1]; + size_t tmat_width = tmat_.dims()[1]; + size_t weight_width = weight_->value().dims()[1]; + auto tmat_value = tmat_.data(); + auto 
weight_value = weight_->mutable_value()->data(); + auto input_value = input_.data(); + + std::unordered_map>> ops; + ops.reserve(weight_->rows().size()); + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + const T *input_value_row = input_value + input_width * i; + const T *tmat_row = tmat_value + i * tmat_width; + for (int j = 0; j < code_length; ++j) { + ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } } - } - for (auto& row : weight->rows()) { - auto& op_in_row = ops[row]; - for (auto& pair : op_in_row) { - auto& scale = pair.first; - auto* input_row = pair.second; - blas.AXPY(input_width, scale, input_row, weight_value); + for (auto &row : weight_->rows()) { + auto &op_in_row = ops[row]; + for (auto &pair : op_in_row) { + auto &scale = pair.first; + auto *input_row = pair.second; + blas.AXPY(input_width, scale, input_row, weight_value); + } + weight_value += weight_width; } - weight_value += weight_width; } +}; + +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, + framework::SelectedRows *weight, + const framework::Tensor &input) { + MatrixBitCodeFunctorMulGradWeightSR func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, - const framework::Tensor& weight, - framework::Tensor* input) { - size_t num_samples = tmat.dims()[0]; - size_t tmat_width = tmat.dims()[1]; - size_t input_width = input->dims()[1]; - size_t weight_width = weight.dims()[1]; - auto tmat_value = tmat.data(); - auto weight_value = weight.data(); - auto input_value = input->data(); - - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - - for (size_t k = 0; k < input_width; ++k) { - input_value[input_width * i + k] += - tmat_value[i * tmat_width + j] * - weight_value[weight_width * index + k]; +struct MatrixBitCodeFunctorMulGradError : public boost::static_visitor { + const framework::Tensor &tmat_; + const framework::Tensor &weight_; + framework::Tensor *input_; + + MatrixBitCodeFunctorMulGradError(const framework::Tensor &tmat, + const framework::Tensor &weight, + framework::Tensor *input) + : tmat_(tmat), weight_(weight), input_(input) {} + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_.dims()[0]; + size_t tmat_width = tmat_.dims()[1]; + size_t input_width = input_->dims()[1]; + size_t weight_width = weight_.dims()[1]; + auto tmat_value = tmat_.data(); + auto weight_value = weight_.data(); + auto input_value = input_->data(); + + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code.calc_index(j); + + for (size_t k = 0; k < input_width; ++k) { + input_value[input_width * i + k] += + tmat_value[i * tmat_width + j] * + weight_value[weight_width * index + k]; + } } } } +}; + +template +void MatrixBitCodeFunctor::MulGradError(const framework::Tensor &tmat, + const framework::Tensor &weight, + framework::Tensor *input) { + MatrixBitCodeFunctorMulGradError func(tmat, weight, input); + code_table_.apply_visitor(func); } template -void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { - size_t num_samples = tmat->dims()[0]; - size_t o_width = tmat->dims()[1]; - auto* tmat_data = 
tmat->data(); - for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table_->get_code(i); - int code_length = code->get_length(); - for (int j = 0; j < code_length; ++j) { - if (code->calc_bit(j)) { - tmat_data[i * o_width + j] -= 1; +struct MatrixBitCodeFunctorSub : public boost::static_visitor { + framework::Tensor *tmat_; + + explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {} + + template + void operator()(const CodeTable &code_table) { + size_t num_samples = tmat_->dims()[0]; + size_t o_width = tmat_->dims()[1]; + auto *tmat_data = tmat_->data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table.get_code(i); + int code_length = code.get_length(); + for (int j = 0; j < code_length; ++j) { + if (code.calc_bit(j)) { + tmat_data[i * o_width + j] -= 1; + } } } } +}; + +template +void MatrixBitCodeFunctor::Sub(framework::Tensor *tmat) { + MatrixBitCodeFunctorSub func(tmat); + code_table_.apply_visitor(func); } template class MatrixBitCodeFunctor; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index cf43ad9d44..01e4889d34 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/variant.h" #if defined(_WIN32) #include @@ -99,24 +100,7 @@ inline int clz(const T& value) { inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } #endif // !_WIN32 -// set a code interface to create multiple code -class Code { - public: - virtual ~Code() {} - virtual size_t calc_index(int bit) const = 0; - virtual bool calc_bit(int bit) const = 0; - virtual int get_length() const = 0; -}; -// set a CodeTable interface to create multiple code table -class CodeTable { - public: - virtual Code* get_code(int64_t code) const = 0; - virtual size_t size() const = 0; - virtual int get_max_code_length() const = 0; - virtual ~CodeTable() {} -}; - -class SimpleCode : public Code { +class SimpleCode { public: SimpleCode(size_t code, size_t num_classes, const int64_t* ids) : c_(static_cast(ids[code]) + num_classes) {} @@ -138,7 +122,7 @@ class SimpleCode : public Code { }; template -class CustomCode : public Code { +class CustomCode { public: CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids, int index) { @@ -155,11 +139,11 @@ class CustomCode : public Code { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const override { return ptable_data_[bit]; } - bool calc_bit(int bit) const override { return pcode_data_[bit]; } + size_t calc_index(int bit) const { return ptable_data_[bit]; } + bool calc_bit(int bit) const { return pcode_data_[bit]; } // NOTE: this function is not thread-safe. 
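// (Editorial aside, not a patch line.) get_length() lazily computes and
// caches the code length in the mutable member length_ shown further below;
// two threads calling it concurrently on the same CustomCode object could
// race on that write, which is what the NOTE above warns about.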
- int get_length() const override { + int get_length() const { if (length_ < 0) { auto len = seq_len_; length_ = @@ -177,46 +161,32 @@ class CustomCode : public Code { mutable int length_{-1}; }; -class SimpleCodeTable : public CodeTable { +class SimpleCodeTable { public: SimpleCodeTable(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids) {} - Code* get_code(int64_t code) const { - auto it = codes_.find(code); - if (it != codes_.end()) { - return it->second.get(); - } - auto* result = new SimpleCode(code, num_classes_, ids_); - codes_.emplace(code, std::unique_ptr(result)); - return result; + SimpleCode get_code(int64_t code) const { + return SimpleCode(code, num_classes_, ids_); } size_t size() const { return num_classes_; } int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } private: - mutable std::map> codes_; - size_t num_classes_; const int64_t* ids_; }; template -class CustomCodeTable : public CodeTable { +class CustomCodeTable { public: CustomCodeTable(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids) : ptable_(ptable), pcode_(pcode), ids_(ids) {} - Code* get_code(int64_t code) const { - auto it = codes_.find(code); - if (it != codes_.end()) { - return it->second.get(); - } - auto* result = new CustomCode(ptable_, pcode_, ids_, code); - codes_.emplace(code, std::unique_ptr(result)); - return result; + CustomCode get_code(int64_t code) const { + return CustomCode(ptable_, pcode_, ids_, code); } size_t size() const { return static_cast(ptable_.dims()[1]); } @@ -225,25 +195,26 @@ class CustomCodeTable : public CodeTable { } private: - mutable std::unordered_map> codes_; const framework::Tensor& ptable_; const framework::Tensor& pcode_; const int64_t* ids_; }; +using CodeTable = boost::variant>; + template class MatrixBitCodeFunctor { public: MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) : num_classes_(num_classes), ids_(ids), - code_table_(new SimpleCodeTable(num_classes, ids)) {} + code_table_(SimpleCodeTable(num_classes, ids)) {} MatrixBitCodeFunctor(const framework::Tensor& ptable, const framework::Tensor& pcode, const int64_t* ids) : num_classes_(static_cast(ptable.dims()[1])), ids_(ids), - code_table_(new CustomCodeTable(ptable, pcode, ids)) {} + code_table_(CustomCodeTable(ptable, pcode, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -293,7 +264,7 @@ class MatrixBitCodeFunctor { size_t num_classes_; const int64_t* ids_; - std::unique_ptr code_table_; + CodeTable code_table_; }; } // namespace math } // namespace operators From aa38fc4ce5cb73e01b614ff57fae9553dcf30abf Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 13 Dec 2018 15:20:40 +0800 Subject: [PATCH 27/45] Fix compile test=develop --- paddle/fluid/inference/tests/api/tester_helper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 8209a049f4..4c8bce4600 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -373,7 +373,7 @@ static bool CompareTensorData(const framework::LoDTensor &a, } for (size_t i = 0; i < a_size; i++) { - if (a.type() == typeid(float)) { + if (a.type() == framework::proto::VarType::FP32) { const auto *a_data = a.data(); const auto *b_data = b.data(); if (std::abs(a_data[i] - b_data[i]) > 1e-3) { @@ -382,7 +382,7 @@ static bool CompareTensorData(const framework::LoDTensor &a, b_data[i]); 
         return false;
       }
-    } else if (a.type() == typeid(int64_t)) {
+    } else if (a.type() == framework::proto::VarType::INT64) {
       const auto *a_data = a.data<int64_t>();
       const auto *b_data = b.data<int64_t>();
       if (std::abs(a_data[i] - b_data[i]) > 1e-3) {

From e3c4b0dacee78d49a4701db788375b02d0916d6a Mon Sep 17 00:00:00 2001
From: SunGaofeng
Date: Thu, 13 Dec 2018 15:46:12 +0800
Subject: [PATCH 28/45] this is for psroi_pool op, test=develop (#14796)

* Add psroi_pool operator.

---
 paddle/fluid/API.spec                         |   1 +
 paddle/fluid/operators/psroi_pool_op.cc       | 173 +++++++++++
 paddle/fluid/operators/psroi_pool_op.cu       | 294 ++++++++++++++++++
 paddle/fluid/operators/psroi_pool_op.h        | 253 +++++++++++++++
 python/paddle/fluid/layers/nn.py              |  55 ++++
 .../fluid/tests/unittests/test_layers.py      |  10 +
 .../tests/unittests/test_psroi_pool_op.py     | 134 ++++++++
 7 files changed, 920 insertions(+)
 create mode 100644 paddle/fluid/operators/psroi_pool_op.cc
 create mode 100644 paddle/fluid/operators/psroi_pool_op.cu
 create mode 100644 paddle/fluid/operators/psroi_pool_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_psroi_pool_op.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index fd4cf92d85..8e6482ca98 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -198,6 +198,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
 paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
+paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)

diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc
new file mode 100644
index 0000000000..6978d9c5dc
--- /dev/null
+++ b/paddle/fluid/operators/psroi_pool_op.cc
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/psroi_pool_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), " + "the input of PSROIPoolOp. " + "The format of input tensor is NCHW. Where N is the batch size, " + "C is the number of input channels, " + "H is the height of the input feature map, and " + "W is the width."); + AddInput("ROIs", + "(LoDTensor), " + "ROIs (Regions of Interest) to pool over. " + "should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]. " + "where (x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates. " + "The roi batch index can be calculated from LoD."); + AddOutput("Out", + "(Tensor), " + "the output of PSROIPoolOp is a 4-D Tensor with shape " + "(num_rois, output_channels, pooled_h, pooled_w)."); + AddAttr( + "output_channels", + "(int), " + "the number of channels of the output feature map. " + "For a task of C classes of objects, output_channels should be " + "(C + 1) for classification only."); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "the pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "the pooled output width.") + .SetDefault(1); + AddComment(R"Doc( +**PSROIPool Operator** + +Position sensitive region of interest pooling (also known as PSROIPooling) is to perform +position-sensitive average pooling on regions of interest specified by input, takes as +input N position-sensitive score maps and a list of num_rois regions of interest. + +PSROIPooling for R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details. 
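[Editorial sketch, not part of the patch.] A concrete instance of the shape contract enforced in PSROIPoolOp::InferShape just below: the input channel count must factor exactly as output_channels * pooled_height * pooled_width. A tiny standalone C++ check with illustrative values:

#include <cassert>

int main() {
  const int input_channels = 490;  // e.g. the value used in the layer docs
  const int pooled_height = 7, pooled_width = 7;
  // InferShape requires input_channels == output_channels * ph * pw,
  // so output_channels is forced to input_channels / (ph * pw) = 10 here.
  assert(input_channels % (pooled_height * pooled_width) == 0);
  const int output_channels = input_channels / (pooled_height * pooled_width);
  assert(output_channels == 10);
  return 0;
}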
+ )Doc"); + } +}; + +class PSROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PSROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of PSROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PSROIPoolOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + + PADDLE_ENFORCE(input_dims.size() == 4, + "The format of input tensor is NCHW"); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]"); + PADDLE_ENFORCE(rois_dims[1] == 4, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]"); + + int pooled_height = ctx->Attrs().Get("pooled_height"); + int pooled_width = ctx->Attrs().Get("pooled_width"); + int output_channels = ctx->Attrs().Get("output_channels"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE( + input_dims[1] == output_channels * pooled_height * pooled_width, + "the channel of X(%d) should be equal to the product of " + "output_channels(%d), pooled_height(%d) and pooled_width(%d)", + input_dims[1], output_channels, pooled_height, pooled_width); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must be greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must be greater than 0"); + PADDLE_ENFORCE_GT(output_channels, 1, + "The pooled output channels must greater than 1"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0."); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = + output_channels; // input_dims[1] / (pooled_height * pooled_width); + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + ctx->SetOutputDim("Out", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class PSROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); +REGISTER_OP_CPU_KERNEL( + psroi_pool, + ops::CPUPSROIPoolOpKernel, + ops::CPUPSROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL( + psroi_pool_grad, + ops::CPUPSROIPoolGradOpKernel, + ops::CPUPSROIPoolGradOpKernel); diff --git 
a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu new file mode 100644 index 0000000000..22fec3244f --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -0,0 +1,294 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/psroi_pool_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void GPUPSROIPoolForward( + const int nthreads, const T* input_data, const T* input_rois, + const float spatial_scale, const int input_channels, const int height, + const int width, const int output_channels, const int pooled_height, + const int pooled_width, const int* rois_batch_id_data, T* output_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + const T* offset_input_data = + input_data + + (roi_batch_id * input_channels + input_channel) * height * width; + T outsum = 0; + + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + outsum += offset_input_data[input_index]; + } + } + + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + output_data[i] = is_empty ? 0. : outsum / bin_area; + } +} + +template +__global__ void GPUPSROIPoolBackward( + const int nthreads, const T* input_rois, const T* output_grad_data, + const float spatial_scale, const int input_channels, const int height, + const int width, const int output_channels, const int pooled_height, + const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_input_grad_data = input_grad_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); + } + } + } +} + +template +class GPUPSROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + int rois_num = rois->dims()[0]; + if (rois_num == 0) return; + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "The rois_batch_size and input(X) batch_size must be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + + // set rois batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(platform::CPUPlace()); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + framework::Tensor rois_batch_id_list_gpu; + framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), + ctx.device_context(), &rois_batch_id_list_gpu); + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + // call cuda kernel function + GPUPSROIPoolForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, + input_channels, height, width, output_channels, pooled_height, + pooled_width, rois_batch_id_list_gpu.data(), + out->mutable_data(ctx.GetPlace())); + } +}; + +template +class GPUPSROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = 
ctx.Input("ROIs"); + + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + int rois_num = rois->dims()[0]; + int input_channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (input_grad) { + // set roi batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(platform::CPUPlace()); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + framework::Tensor rois_batch_id_list_gpu; + framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), + ctx.device_context(), &rois_batch_id_list_gpu); + + input_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); + + int output_grad_size = output_grad->numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPUPSROIPoolBackward< + T><<>>( + output_grad_size, rois->data(), output_grad->data(), + spatial_scale, input_channels, height, width, output_channels, + pooled_height, pooled_width, rois_batch_id_list_gpu.data(), + input_grad->mutable_data(ctx.GetPlace())); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + psroi_pool, + ops::GPUPSROIPoolOpKernel, + ops::GPUPSROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( + psroi_pool_grad, + ops::GPUPSROIPoolGradOpKernel, + ops::GPUPSROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h new file mode 100644 index 0000000000..1a424728f7 --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -0,0 +1,253 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class CPUPSROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_channels = ctx.Attr("output_channels"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto in_stride = framework::stride(in_dims); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out->dims()); + + const T* input_data = in->data(); + + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(ctx.GetPlace()); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "the rois_batch_size and input(X) batch_size should be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num, + "the rois_num from input and lod must be the same"); + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + T* output_data = out->mutable_data(ctx.GetPlace()); + const T* input_rois = rois->data(); + + // calculate psroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num; ++n) { + // set roi batch id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small rois to be 1 x 1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute bin size w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // calculate each pixel of the output feature map. 
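// (Editorial aside, not a patch line.) For bin (ph, pw) the sampled window is
// [floor(ph * bin_size_h + roi_start_h), ceil((ph + 1) * bin_size_h +
// roi_start_h)) in rows, and likewise in columns. For example, with
// roi_start_h = 2 and bin_size_h = 1.5, consecutive bins cover rows [2, 4)
// and [3, 5): the floor/ceil rounding lets adjacent bins overlap by a pixel.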
+ int out_roi_offset = n * out_stride[0]; + for (int c = 0; c < output_channels; ++c) { + // per category + int out_plane_offset = out_roi_offset + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + int out_row_offset = out_plane_offset + ph * out_stride[2]; + for (int pw = 0; pw < pooled_width; ++pw) { + // calculate w and h at input feature map + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + wstart = std::min(std::max(wstart, 0), width); + hend = std::min(std::max(hend, 0), height); + wend = std::min(std::max(wend, 0), width); + + int output_index = out_row_offset + pw; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_plane_offset = + roi_batch_id * in_stride[0] + input_channel * in_stride[1]; + const T* offset_input_data = input_data + input_plane_offset; + T out_sum = 0.; + bool is_empty = (hend <= hstart) || (wend <= wstart); + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * in_stride[2] + iw; + out_sum += offset_input_data[input_index]; + } + } + T bin_area = (hend - hstart) * (wend - wstart); + output_data[output_index] = is_empty ? 0. : out_sum / bin_area; + } + } + } + } + return; + } +}; + +template +class CPUPSROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + if (input_grad) { + auto in_dims = in->dims(); + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + // set roi batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(ctx.GetPlace()); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + const T* input_rois = rois->data(); + const T* output_grad_data = output_grad->data(); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + + // set gradient of X to be 0. before backpropagate. 
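// (Editorial aside, not a patch line.) The backward pass mirrors the forward
// average pooling: each output bin received out_sum / bin_area, so its
// incoming gradient is divided by bin_area and added to every input pixel
// inside the bin, which is exactly what the scatter loops further below do.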
+ math::SetConstant set_zero; + set_zero(ctx.template device_context(), input_grad, + static_cast(0)); + + // backpropagate gradient per output pixel + int output_grad_size = output_grad->numel(); + for (int i = 0; i < output_grad_size; ++i) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_input_grad_data = input_grad_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + offset_input_grad_data[input_index] += diff_val; + } + } + } + } + return; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e25eaaa9fd..3832cae8c3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -173,6 +173,7 @@ __all__ = [ 'merge_selected_rows', 'get_tensor_from_selected_rows', 'lstm', + 'psroi_pool', ] kIgnoreIndex = -100 @@ -9122,3 +9123,57 @@ def get_tensor_from_selected_rows(x, name=None): outputs={'Out': out}, attrs={}) return out + + +@templatedoc() +def psroi_pool(input, + rois, + output_channels, + spatial_scale, + pooled_height, + pooled_width, + name=None): + """ + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + output_channels (integer): ${output_channels_comment} + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + name (str, default None): The name of this layer. + + Returns: + Variable: ${out_comment}. 
+
+    Examples:
+        .. code-block:: python
+
+            pool_out = fluid.layers.psroi_pool(input=x, rois=rois,
+                                               output_channels=490,
+                                               spatial_scale=1.0,
+                                               pooled_height=7,
+                                               pooled_width=7)
+    """
+    helper = LayerHelper('psroi_pool', **locals())
+    # check attrs
+    if not isinstance(output_channels, int):
+        raise TypeError("output_channels must be int type")
+    if not isinstance(spatial_scale, float):
+        raise TypeError("spatial_scale must be float type")
+    if not isinstance(pooled_height, int):
+        raise TypeError("pooled_height must be int type")
+    if not isinstance(pooled_width, int):
+        raise TypeError("pooled_width must be int type")
+    dtype = helper.input_dtype()
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='psroi_pool',
+        inputs={'X': input,
+                'ROIs': rois},
+        outputs={'Out': out},
+        attrs={
+            'output_channels': output_channels,
+            'spatial_scale': spatial_scale,
+            'pooled_height': pooled_height,
+            'pooled_width': pooled_width
+        })
+    return out
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 10e8bb5a86..fb3e4da1ef 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -511,6 +511,16 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
             print(str(program))
 
+    def test_psroi_pool(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[245, 30, 30], dtype="float32")
+            rois = layers.data(
+                name="rois", shape=[4], dtype="float32", lod_level=1)
+            output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7)
+            self.assertIsNotNone(output)
+            print(str(program))
+
     def test_roi_align(self):
         program = Program()
         with program_guard(program):
diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
new file mode 100644
index 0000000000..abe014a38c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
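+
+# Note: the reference computation in this test mirrors the C++ CPU kernel
+# above: each output bin [hstart, hend) x [wstart, wend) comes from floor/ceil
+# of the bin boundaries, clipped to the feature map, and is averaged over the
+# single input channel c_in = (c * pooled_height + ph) * pooled_width + pw.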
+ +from __future__ import print_function + +import math +import numpy as np +import unittest +from op_test import OpTest + + +class TestPSROIPoolOp(OpTest): + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_psroi_pool() + self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)} + self.attrs = { + 'output_channels': self.output_channels, + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width + } + self.outputs = {'Out': self.outs} + + def init_test_case(self): + self.batch_size = 3 + self.channels = 3 * 2 * 2 + self.height = 6 + self.width = 4 + + self.x_dim = [self.batch_size, self.channels, self.height, self.width] + + self.spatial_scale = 1.0 / 4.0 + self.output_channels = 3 + self.pooled_height = 2 + self.pooled_width = 2 + + self.x = np.random.random(self.x_dim).astype('float32') + + def make_rois(self): + rois = [] + self.rois_lod = [[]] + for bno in range(self.batch_size): + self.rois_lod[0].append(bno + 1) + for i in range(bno + 1): + x1 = np.random.random_integers( + 0, self.width // self.spatial_scale - self.pooled_width) + y1 = np.random.random_integers( + 0, self.height // self.spatial_scale - self.pooled_height) + + x2 = np.random.random_integers(x1 + self.pooled_width, + self.width // self.spatial_scale) + y2 = np.random.random_integers( + y1 + self.pooled_height, self.height // self.spatial_scale) + roi = [bno, x1, y1, x2, y2] + rois.append(roi) + self.rois_num = len(rois) + self.rois = np.array(rois).astype('float32') + + def calc_psroi_pool(self): + output_shape = (self.rois_num, self.output_channels, self.pooled_height, + self.pooled_width) + out_data = np.zeros(output_shape) + for i in range(self.rois_num): + roi = self.rois[i] + roi_batch_id = int(roi[0]) + roi_start_w = round(roi[1]) * self.spatial_scale + roi_start_h = round(roi[2]) * self.spatial_scale + roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale + roi_end_h = (round(roi[4]) + 1.) * self.spatial_scale + + roi_height = max(roi_end_h - roi_start_h, 0.1) + roi_width = max(roi_end_w - roi_start_w, 0.1) + + bin_size_h = roi_height / float(self.pooled_height) + bin_size_w = roi_width / float(self.pooled_width) + + x_i = self.x[roi_batch_id] + + for c in range(self.output_channels): + for ph in range(self.pooled_height): + for pw in range(self.pooled_width): + hstart = int( + math.floor(float(ph) * bin_size_h + roi_start_h)) + wstart = int( + math.floor(float(pw) * bin_size_w + roi_start_w)) + hend = int( + math.ceil( + float(ph + 1) * bin_size_h + roi_start_h)) + wend = int( + math.ceil( + float(pw + 1) * bin_size_w + roi_start_w)) + hstart = min(max(hstart, 0), self.height) + hend = min(max(hend, 0), self.height) + wstart = min(max(wstart, 0), self.width) + wend = min(max(wend, 0), self.width) + + c_in = (c * self.pooled_height + ph + ) * self.pooled_width + pw + is_empty = (hend <= hstart) or (wend <= wstart) + out_sum = 0. + for ih in range(hstart, hend): + for iw in range(wstart, wend): + out_sum += x_i[c_in, ih, iw] + bin_area = (hend - hstart) * (wend - wstart) + out_data[i, c, ph, pw] = 0. 
if is_empty else (
+                            out_sum / float(bin_area))
+        self.outs = out_data.astype('float32')
+
+    def setUp(self):
+        self.op_type = 'psroi_pool'
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()

From 7b10bf0e60e9ac0f56ff532fe58cbf5c538a81b6 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Thu, 13 Dec 2018 16:40:51 +0800
Subject: [PATCH 29/45] Use mkl

---
 .../fluid/operators/hierarchical_sigmoid_op.h | 28 ++++++++++++-------
 paddle/fluid/operators/math/blas.h            |  8 ++++++
 paddle/fluid/operators/math/blas_impl.h       | 21 ++++++++++++++
 paddle/fluid/platform/dynload/mklml.h         |  2 ++
 4 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index b73a32af89..d212e6f843 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -150,19 +150,27 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
                                            label.data<int64_t>()));
   }
 
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto pre_out_mat = EigenMatrix<T>::From(pre_out);
-    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
-    auto out_grad_mat = EigenMatrix<T>::From(out_grad);
+
+    // softrelu derivative
 
-    Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])};
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
 
-    // softrelu derivative
-    pre_out_grad_mat.device(place) =
-        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
+    auto* pre_out_grad_data = pre_out_grad.data<T>();
+    auto* pre_out_data = pre_out.data<T>();
+    auto n = pre_out.numel();
+    blas.VEXP(n, pre_out_data, pre_out_grad_data);
+    blas.VINV(n, pre_out_grad_data, pre_out_grad_data);
+    for (int64_t i = 0; i < n; ++i) {
+      pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i];
+    }
     bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
-    pre_out_grad_mat.device(place) =
-        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
+    auto* out_grad_data = out_grad.data<T>();
+
+    int64_t dim0 = pre_out_grad.dims()[0];
+    int64_t dim1 = pre_out_grad.dims()[1];
+    for (int64_t i = 0; i < dim0; ++i) {
+      T tmp = out_grad_data[i];
+      blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1);
+    }
     // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
     // be consistent with the clipping in forward.
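Both sides of the rewrite above compute the softrelu derivative 1 - 1/exp(pre_out) (equivalently 1 - exp(-pre_out)); the new code only splits it into vectorized VEXP/VINV passes plus a scalar loop. A quick numpy check of that identity (a sketch; the array here is illustrative, not from the patch):

    import numpy as np

    pre_out = np.random.rand(4, 3)

    # Old Eigen path: one fused expression.
    eigen_like = 1.0 - 1.0 / np.exp(pre_out)

    # New MKL path: three passes, mirroring blas.VEXP, blas.VINV and the loop.
    g = np.exp(pre_out)   # blas.VEXP(n, pre_out_data, pre_out_grad_data)
    g = 1.0 / g           # blas.VINV(n, pre_out_grad_data, pre_out_grad_data)
    g = 1.0 - g           # pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]

    assert np.allclose(eigen_like, g)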
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 9f3a81f22c..f67f57827b 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -181,6 +181,9 @@ class Blas {
                   const framework::Tensor& mat_b, const MatDescriptor& dim_b,
                   T alpha, framework::Tensor* mat_out, T beta) const;
 
+  template <typename T>
+  void VINV(int n, const T* a, T* y) const;
+
  private:
   const DeviceContext& context_;
 };
@@ -282,6 +285,11 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template BatchedGEMM<T>(args...);
   }
 
+  template <typename... ARGS>
+  void VINV(ARGS... args) const {
+    Base()->template VINV<T>(args...);
+  }
+
  private:
   const Blas<DeviceContext>* Base() const {
     return static_cast<const Blas<DeviceContext>*>(this);
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index c84087bb1e..972366bc09 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -118,6 +118,11 @@ struct CBlas<float> {
   static void VPOW(ARGS... args) {
     platform::dynload::vsPowx(args...);
   }
+
+  template <typename... ARGS>
+  static void VINV(ARGS... args) {
+    platform::dynload::vsInv(args...);
+  }
 };
 
 template <>
@@ -213,6 +218,11 @@ struct CBlas<double> {
   static void VPOW(ARGS... args) {
     platform::dynload::vdPowx(args...);
   }
+
+  template <typename... ARGS>
+  static void VINV(ARGS... args) {
+    platform::dynload::vdInv(args...);
+  }
 };
 
 #else
@@ -603,6 +613,17 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
                     dim_a.stride_, dim_b.stride_);
   }
 }
+template <typename DeviceContext>
+template <typename T>
+void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VINV(n, a, y);
+#else
+  for (int i = 0; i < n; ++i) {
+    y[i] = 1.0 / a[i];
+  }
+#endif
+}
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index f0a9736623..c3f9433503 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -82,6 +82,8 @@ extern void* mklml_dso_handle;
   __macro(vdSqr);                 \
   __macro(vsPowx);                \
   __macro(vdPowx);                \
+  __macro(vsInv);                 \
+  __macro(vdInv);                 \
   __macro(MKL_Set_Num_Threads)
 
 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);

From e2130502234f042a6381939f80c640bbebe2e1c6 Mon Sep 17 00:00:00 2001
From: Wang Guibao
Date: Thu, 13 Dec 2018 20:49:50 +0800
Subject: [PATCH 30/45] Fix multi-threading bug with WITH_MKL=ON (#14882)

fixes #14884
---
 paddle/fluid/framework/executor_thread_worker.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc
index 3d53511615..5fc5aeb662 100644
--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/pybind/pybind.h"
 namespace paddle {
@@ -174,6 +175,8 @@ void print_fetch_var(Scope* scope, std::string var_name) {
 }
 
 void ExecutorThreadWorker::TrainFiles() {
+  platform::SetNumThreads(1);
+
   // todo: configurable
   SetDevice();

From dc2ff42e20d72a449b200da0522b55a53b28091d Mon Sep 17 00:00:00 2001
From: dengkaipeng
Date: Thu, 13 Dec 2018 23:14:10 +0800
Subject: [PATCH 31/45] add math in python examples.

test=develop
---
 python/paddle/fluid/layers/nn.py | 40 +++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 6 deletions(-)
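The pseudocode this patch adds to the adaptive pooling docstrings runs as-is once translated to numpy; a minimal executable version of the 2-D case follows (illustrative only, not part of the patch):

    import math
    import numpy as np

    def adaptive_avg_pool2d(x, m, n):
        # x: [N, C, H, W] -> [N, C, m, n], same floor/ceil bins as the docstring.
        N, C, H, W = x.shape
        out = np.zeros((N, C, m, n), dtype=x.dtype)
        for i in range(m):
            hstart, hend = math.floor(i * H / m), math.ceil((i + 1) * H / m)
            for j in range(n):
                wstart, wend = math.floor(j * W / n), math.ceil((j + 1) * W / n)
                out[:, :, i, j] = x[:, :, hstart:hend,
                                    wstart:wend].mean(axis=(2, 3))
        return out

    # adaptive_avg_pool2d(np.random.rand(2, 3, 32, 32), 3, 3).shape == (2, 3, 3, 3)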
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 07fc4ccc6b..4a557ce247 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2536,16 +2536,28 @@ def adaptive_pool2d(input,
         ValueError: 'pool_size' should be a list or tuple with length as 2.
 
     Examples:
-        .. code-block:: python
+          # suppose input data is in shape of [N, C, H, W], `pool_size` is [m, n],
+          # output shape is [N, C, m, n]; adaptive pool divides the H and W
+          # dimensions of the input data into m * n grids evenly and performs
+          # pooling in each grid to get the output.
+          # adaptive average pool performs calculations as follows:
+          #
+          #     for i in range(m):
+          #         for j in range(n):
+          #             hstart = floor(i * H / m)
+          #             hend = ceil((i + 1) * H / m)
+          #             wstart = floor(j * W / n)
+          #             wend = ceil((j + 1) * W / n)
+          #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
+          #
           data = fluid.layers.data(
               name='data', shape=[3, 32, 32], dtype='float32')
           pool_out = fluid.layers.adaptive_pool2d(
                             input=data,
                             pool_size=[3, 3],
-                            pool_type='max',
-                            require_index=False)
+                            pool_type='avg')
     """
     if pool_type not in ["max", "avg"]:
         raise ValueError(
@@ -2632,16 +2644,32 @@ def adaptive_pool3d(input,
         ValueError: 'pool_size' should be a list or tuple with length as 2.
 
     Examples:
-        .. code-block:: python
+          # suppose input data is in shape of [N, C, D, H, W], `pool_size` is [l, m, n],
+          # output shape is [N, C, l, m, n]; adaptive pool divides the D, H and W
+          # dimensions of the input data into l * m * n grids evenly and performs
+          # pooling in each grid to get the output.
+          # adaptive average pool performs calculations as follows:
+          #
+          #     for i in range(l):
+          #         for j in range(m):
+          #             for k in range(n):
+          #                 dstart = floor(i * D / l)
+          #                 dend = ceil((i + 1) * D / l)
+          #                 hstart = floor(j * H / m)
+          #                 hend = ceil((j + 1) * H / m)
+          #                 wstart = floor(k * W / n)
+          #                 wend = ceil((k + 1) * W / n)
+          #                 output[:, :, i, j, k] =
+          #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
+          #
           data = fluid.layers.data(
               name='data', shape=[3, 32, 32, 32], dtype='float32')
           pool_out = fluid.layers.adaptive_pool3d(
                             input=data,
-                            pool_size=[3, 3],
-                            pool_type='max',
-                            require_index=True)
+                            pool_size=[3, 3, 3],
+                            pool_type='avg')
     """
     if pool_type not in ["max", "avg"]:
         raise ValueError(

From b17444c84c45ae7f04863964099412a8ad9bf8d0 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Fri, 14 Dec 2018 12:33:05 +0800
Subject: [PATCH 32/45] Fix merge bug

test=develop
---
 paddle/fluid/operators/psroi_pool_op.cc | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc
index 6978d9c5dc..78989582b7 100644
--- a/paddle/fluid/operators/psroi_pool_op.cc
+++ b/paddle/fluid/operators/psroi_pool_op.cc
@@ -129,9 +129,8 @@ class PSROIPoolOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -150,9 +149,8 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-        ctx.device_context());
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.device_context());
   }
 };

From f16aa394f61cd759562ecfb5d3553700932d71b1 Mon Sep 17 00:00:00 2001
From: dengkaipeng
Date: Fri, 14 Dec 2018 12:14:37 +0800
Subject: [PATCH 33/45] remove use_cudnn in python API.
test=develop --- paddle/fluid/API.spec | 4 ++-- python/paddle/fluid/layers/nn.py | 22 ---------------------- 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 845abe7d5b..d67363003a 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -77,8 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) -paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=('max', False, False, None)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4a557ce247..28a119906b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2506,7 +2506,6 @@ def adaptive_pool2d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=False, name=None): """ ${comment} @@ -2521,7 +2520,6 @@ def adaptive_pool2d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2530,8 +2528,6 @@ def adaptive_pool2d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. - ValueError: 'use_cudnn' is not a bool value. 
- ValueError: adaptive pool currently not supported in cudnn. ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. ValueError: 'pool_size' should be a list or tuple with length as 2. @@ -2575,12 +2571,6 @@ def adaptive_pool2d(input, raise ValueError( "'pool_size' should be a list or tuple with length as 2.") - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False.") - - if use_cudnn: - raise ValueError("adaptive pool currently not supported in cudnn.") - if pool_type == "max": l_type = 'max_pool2d_with_index' else: @@ -2602,7 +2592,6 @@ def adaptive_pool2d(input, attrs={ "pooling_type": pool_type, "ksize": pool_size, - "use_cudnn": use_cudnn, "adaptive": True, }) @@ -2614,7 +2603,6 @@ def adaptive_pool3d(input, pool_size, pool_type="max", require_index=False, - use_cudnn=False, name=None): """ ${comment} @@ -2629,7 +2617,6 @@ def adaptive_pool3d(input, pool_type: ${pooling_type_comment} require_index (bool): If true, the index of max pooling point along with outputs. it cannot be set in average pooling type. - use_cudnn (bool, default False): adaptive pool currently not supported in cudnn. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -2638,8 +2625,6 @@ def adaptive_pool3d(input, Raises: ValueError: 'pool_type' is not 'max' nor 'avg'. - ValueError: 'use_cudnn' is not a bool value. - ValueError: adaptive pool currently not supported in cudnn. ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. ValueError: 'pool_size' should be a list or tuple with length as 2. @@ -2687,12 +2672,6 @@ def adaptive_pool3d(input, raise ValueError( "'pool_size' should be a list or tuple with length as 3.") - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False.") - - if use_cudnn: - raise ValueError("adaptive pool currently not supported in cudnn.") - if pool_type == "max": l_type = 'max_pool3d_with_index' else: @@ -2714,7 +2693,6 @@ def adaptive_pool3d(input, attrs={ "pooling_type": pool_type, "ksize": pool_size, - "use_cudnn": use_cudnn, "adaptive": True, }) From 04a570b4634f3cab2815cd1688df192f0f5b1d81 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 14 Dec 2018 14:59:35 +0800 Subject: [PATCH 34/45] Fix ut test=develop --- paddle/fluid/framework/data_type_test.cc | 2 +- paddle/fluid/framework/op_kernel_type_test.cc | 3 ++- paddle/fluid/inference/api/api_impl_tester.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 92639dfc61..2a380201f2 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -35,6 +35,6 @@ TEST(DataType, float16) { EXPECT_EQ(f::SizeOfType(dtype), 2u); // test debug info - std::string type = "float16"; + std::string type = "::paddle::platform::float16"; EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); } diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index 3e17a512ce..40db85400d 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -34,7 +34,8 @@ TEST(OpKernelType, ToString) { OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), - "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_" + 
"data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" + "CUDAPlace(0)]:library_" "type[CUDNN]"); } diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 191225493c..7839639739 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -39,7 +39,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { if (t->type() == framework::proto::VarType::INT64) { pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; - } else if (t->type() == framework::proto::VarType::INT32) { + } else if (t->type() == framework::proto::VarType::FP32) { pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else { From f702ab74b9edfe6310470ad1ad98ae054f3120fc Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 14 Dec 2018 07:36:45 +0000 Subject: [PATCH 35/45] add dist transpiler test --- .../tests/unittests/test_dist_transpiler.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 650a745cdc..27575897b5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -875,5 +875,53 @@ class TestRemoteNce(TestDistLookupTableBase): pass +# test for remote prefetch +class TestRemoteHsigmoid(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 10 + + input = fluid.layers.data(name="input", shape=[10], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + path_table = fluid.layers.data( + name='path_table', shape=[10], dtype='int64') + path_code = fluid.layers.data( + name='path_code', shape=[10], dtype='int64') + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='hs_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 1], + dtype='float32', + name='hs_b', + initializer=fluid.initializer.ConstantInitializer()) + + cost = fluid.layers.hsigmoid( + input=input, + label=label, + num_classes=non_leaf_num, + path_table=path_table, + path_code=path_code, + is_custom=True, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.SGD(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + for op in trainer.blocks[0].ops: + if op.type == "recv": + pass + + if __name__ == "__main__": unittest.main() From e90b2f104cbf4277e3cc55171e715e91f2512251 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 14 Dec 2018 16:07:00 +0800 Subject: [PATCH 36/45] In most times, const_cast is bad and break interface contract and make the code unreadable and make the program unstable. 
test=develop --- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 2 ++ paddle/scripts/paddle_build.sh | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index dd64cc327f..f2ba75485c 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -300,9 +300,11 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { } CudnnRNNCache *cudnn_rnn_cache = nullptr; if (cache_var->IsInitialized()) { + // const_cast is usually bad. cudnn_rnn_cache = const_cast(cache_var) ->GetMutable(); } else { + // const_cast is usually bad. cudnn_rnn_cache = const_cast(cache_var) ->GetMutable(); std::random_device rnd; diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 6299b166af..a1c1886c7f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -517,6 +517,18 @@ function assert_api_spec_approvals() { fi fi done + + HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast` + if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` + echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" + if [ "${APPROVALS}" == "FALSE" ]; then + echo "You must have at least 2 approvals for the const_cast" + exit 1 + fi + fi + } From 67b555d3d3c98d571dffe5b2b8e1c0bae59bd80d Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Fri, 14 Dec 2018 11:31:00 +0100 Subject: [PATCH 37/45] Enable ngraph tests for a ngraph engine (#14800) * Enable ngraph tests for a ngraph engine test=develop * Move the test structure to other place test=develop * Add USE_NGRAPH flag, simple structure test=develop --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 10 +++++++--- .../fluid/tests/unittests/ngraph/CMakeLists.txt | 6 ++++++ .../paddle/fluid/tests/unittests/ngraph/__init__.py | 13 +++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/ngraph/__init__.py diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a4089ba3ca..6d6fe245d8 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -63,9 +63,9 @@ function(py_test_modules TARGET_NAME) set(multiValueArgs MODULES DEPS ENVS) cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} - ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} + ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (py_test_modules_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() @@ -111,3 +111,7 @@ py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executo if(NOT APPLE) py_test_modules(test_image_classification_resnet 
MODULES test_image_classification_resnet SERIAL) endif() + +if (WITH_NGRAPH) + add_subdirectory(ngraph) +endif() diff --git a/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt new file mode 100644 index 0000000000..5ed2d0aa80 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS FLAGS_use_ngraph=true) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/ngraph/__init__.py b/python/paddle/fluid/tests/unittests/ngraph/__init__.py new file mode 100644 index 0000000000..b94a21a7e4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From a985949be99266a12003071bcadec2d9f7785d58 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 14 Dec 2018 19:21:40 +0800 Subject: [PATCH 38/45] Fea/fuse conv elementwise add fuse (#14669) --- paddle/fluid/framework/ir/CMakeLists.txt | 2 + .../ir/conv_elementwise_add2_act_fuse.cc | 106 ++++++++++++++++ .../ir/conv_elementwise_add2_act_fuse_pass.cc | 105 ++++++++++++++++ .../ir/conv_elementwise_add2_act_fuse_pass.h | 33 +++++ .../ir/conv_elementwise_add_act_fuse_pass.cc | 104 ++++++++++++++++ .../ir/conv_elementwise_add_act_fuse_pass.h | 33 +++++ .../framework/ir/graph_pattern_detector.cc | 113 +++++++++++++++++- .../framework/ir/graph_pattern_detector.h | 45 +++++++ .../api/analysis_predictor_tester.cc | 7 +- .../fluid/inference/api/paddle_pass_builder.h | 5 +- paddle/fluid/inference/io.cc | 2 +- .../inference/tests/api/trt_models_tester.cc | 25 +++- .../operators/controlflow/CMakeLists.txt | 2 +- paddle/fluid/operators/conv_op.cc | 4 +- paddle/fluid/platform/device_context.cc | 1 + 15 files changed, 580 insertions(+), 7 deletions(-) create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 883575e41d..be4151b54b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -42,6 +42,8 @@ pass_library(multi_batch_merge_pass base) pass_library(conv_bn_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(is_test_pass base) +pass_library(conv_elementwise_add_act_fuse_pass inference) +pass_library(conv_elementwise_add2_act_fuse_pass inference) if(WITH_MKLDNN) 
pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc new file mode 100644 index 0000000000..6e9905b7ec --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. 
+ PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // ResidualData + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {conv_op, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out}); + }; + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc new file mode 100644 index 0000000000..23f343f631 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" +#include + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. 
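+// (The fused desc keeps the original conv2d inputs and attributes; the two
+// elementwise_add Y operands are rewired in as Bias and ResidualData, and the
+// activation type is recorded, so a single fused kernel can replace the chain.)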
+framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseadd2Act pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, conv_op); // Input + IR_NODE_LINK_TO(conv_filter, conv_op); // Filter + IR_NODE_LINK_TO(conv_op, conv_out); // Output + IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {conv_op, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out}); + }; + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h new file mode 100644 index 0000000000..3b40a5a926 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAdd2ActFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAdd2ActFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc new file mode 100644 index 0000000000..fe3b4fca79 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& activation, const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + desc.Flush(); + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("conv2d", "Input") + ->AsInput(); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = + PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. 
+ auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op, + elementwise_add_out, act_op}); + }; + + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAddActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h new file mode 100644 index 0000000000..ac69aa6458 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddActFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddActFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 0118019df2..bf12d12459 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -17,6 +17,7 @@ #include #include +#include "graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_traits.h" @@ -25,6 +26,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" #include "paddle/fluid/string/printf.h" + namespace paddle { namespace framework { namespace ir { @@ -104,7 +106,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { for (auto &node : GraphTraits::DFS(graph)) { for (const auto &pdnode : pattern_.nodes()) { if (pdnode->Tell(&node)) { - VLOG(4) << "pdnode " << pdnode->name() << " marked"; + VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name(); pdnodes2nodes_[pdnode.get()].insert(&node); } } @@ -1099,6 +1101,115 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } + +std::unordered_set conv_act_set({"identity", "sigmoid", "relu", + "relu6", "relux", "tanh", + "band_pass"}); + +PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { 
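+  // Matched chain: conv2d(Input=conv_in, Filter) -> conv_out ->
+  // elementwise_add(X=conv_out, Y=elementwise_add_in_y) -> act -> act_out.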
+ conv_in->AsInput(); + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + auto act_op = pattern->NewNode(act_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + auto op_type = node->Name(); + return conv_act_set.count(op_type); + }); + + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_var() + // is activation op's output. + ->assert_more([&](Node *node) { + for (auto *in_op : node->inputs) { + if (conv_act_set.count(in_op->Name())) { + return true; + } + } + return false; + }) + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}); + conv_out->LinksFrom({conv_op}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + act_op->LinksFrom({elementwise_add_out}).LinksTo({act_out}); + + return act_out; +} + +PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + auto act_op = pattern->NewNode(act_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + auto op_type = node->Name(); + return conv_act_set.count(op_type); + }); + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_var() + // is activation op's output. 
+ ->assert_more([&](Node *node) { + for (auto *in_op : node->inputs) { + if (conv_act_set.count(in_op->Name())) { + return true; + } + } + return false; + }) + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + elementwise_add_op_1->LinksFrom( + {elementwise_add_out, elementwise_add_in_y_1}); + act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); + return act_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d044802f22..0fee2f1c18 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -671,6 +671,51 @@ struct ElementwiseAdd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_y); PATTERN_DECL_NODE(elementwise_add_out); }; + +// Conv + ElementwiseAdd + an activation +// This pattern can futher fuse the conv related ops after the conv+bn fusion. +struct ConvElementwiseaddAct : public PatternBase { + ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(conv_filter); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); // input + PATTERN_DECL_NODE(elementwise_add_out); + + PATTERN_DECL_NODE(act_op); + PATTERN_DECL_NODE(act_out); +}; + +// Conv + ElementwiseAdd + ElementwiseAdd + Activation +struct ConvElementwiseadd2Act : public PatternBase { + ConvElementwiseadd2Act(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, + "conv_elementwiseadd2_elementwiseadd_act") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_filter); + PATTERN_DECL_NODE(conv_out); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); // input + PATTERN_DECL_NODE(elementwise_add_out); + + PATTERN_DECL_NODE(elementwise_add_op_1); + PATTERN_DECL_NODE(elementwise_add_in_y_1); // input + PATTERN_DECL_NODE(elementwise_add_out_1); + + PATTERN_DECL_NODE(act_op); + PATTERN_DECL_NODE(act_out); +}; + } // namespace patterns // Link two ir::Nodes from each other. 
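These two patterns let the GPU pass list fold conv2d -> elementwise_add [-> elementwise_add] -> activation chains into one conv2d_fusion op. A minimal numpy sketch of the algebraic contract the fused op must satisfy (illustrative only; conv_out stands in for the raw conv2d result, relu for any member of conv_act_set):

    import numpy as np

    relu = lambda t: np.maximum(t, 0.0)

    conv_out = np.random.rand(1, 8, 14, 14).astype('float32')   # conv2d(Input, Filter)
    bias = np.random.rand(1, 8, 1, 1).astype('float32')         # first elementwise_add Y
    residual = np.random.rand(1, 8, 14, 14).astype('float32')   # second elementwise_add Y

    # Unfused graph, as matched by ConvElementwiseadd2Act:
    unfused = relu((conv_out + bias) + residual)

    # Single fused kernel: Output = act(conv + Bias + ResidualData).
    fused = relu(conv_out + bias + residual)

    assert np.allclose(unfused, fused)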
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index d67305670c..a361b34437 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -55,7 +55,12 @@ TEST(AnalysisPredictor, analysis_off) { } TEST(AnalysisPredictor, analysis_on) { - AnalysisConfig config(false); +#ifdef PADDLE_WITH_CUDA + AnalysisConfig config(true); + config.fraction_of_gpu_memory = 0.15; +#else + AnalysisConfig config; +#endif config.model_dir = FLAGS_dirname; config.enable_ir_optim = true; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index bc5139a7e5..e6e7de2478 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -118,7 +118,10 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", "conv_bn_fuse_pass", + "infer_clean_graph_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // }); } diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 24d15f12f9..ae72a74acc 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -79,7 +79,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, for (auto* var : global_block.AllVars()) { if (IsPersistable(var)) { - VLOG(3) << "persistable variable's name: " << var->Name(); + VLOG(4) << "persistable variable's name: " << var->Name(); framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 9eb3fb5da1..d3bd035c1c 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -78,6 +78,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { contrib::AnalysisConfig config(true); + config.pass_builder()->TurnOnDebug(); SetConfig(&config, model_dir, true, use_tensorrt, FLAGS_batch_size); TestPrediction(reinterpret_cast(&config), @@ -141,9 +142,31 @@ TEST(TensorRT_resnext50, profile) { profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); } +TEST(resnext50, compare_analysis_native) { + std::string model_dir = FLAGS_infer_model + "/resnext50"; + compare(model_dir, false /*use tensorrt*/); +} + TEST(TensorRT_mobilenet, analysis) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - compare(model_dir, /* use_tensorrt */ false); + compare(model_dir, false /* use_tensorrt */); +} + +TEST(AnalysisPredictor, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + AnalysisConfig config(true); + config.model_dir = model_dir; + config.fraction_of_gpu_memory = 0.15; + config.pass_builder()->TurnOnDebug(); + + std::vector> inputs_all; + auto predictor = CreatePaddlePredictor(config); + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + + std::vector outputs; + for (auto& input : inputs_all) { + ASSERT_TRUE(predictor->Run(input, &outputs)); + } } } // namespace inference diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index b1c2ee2295..b614e9b035 100644 
--- a/paddle/fluid/operators/controlflow/CMakeLists.txt
+++ b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -1,4 +1,4 @@
 include(operators)
-register_operators()
+register_operators(DEPS naive_executor)
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index d7b8766288..b09e527b90 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -44,7 +44,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");

   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
-                 "Conv intput should be 4-D or 5-D tensor.");
+                 "Conv input should be 4-D or 5-D tensor, got %u",
+                 in_dims.size());
+
   PADDLE_ENFORCE_EQ(
       in_dims.size(), filter_dims.size(),
       "Conv input dimension and filter dimension should be the same.");
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index bd81d4dd1f..d2e23d80f4 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -3,6 +3,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

From 787d837f503a43f5bd2d8dfe5e5c2417a55084c7 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 14 Dec 2018 19:35:48 +0800
Subject: [PATCH 39/45] fix test=develop

---
 paddle/scripts/paddle_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index a1c1886c7f..0fc43f33d0 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -518,7 +518,7 @@ function assert_api_spec_approvals() {
     fi
   done

-  HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast`
+  HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast || true`
   if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
     APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
       python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`

From 37c2e24511a29a2b23e18869b51f8edf805cead3 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 14 Dec 2018 19:48:34 +0800
Subject: [PATCH 40/45] Update README.md

---
 README.md | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/README.md b/README.md
index c535e9514e..32a302cc54 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,15 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
+欢迎来到 PaddlePaddle GitHub + +PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 + +我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 + +跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) + + ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` @@ -34,6 +43,23 @@ pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` + +### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### 安装最新稳定版本: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==1.2.0.post87 +# Linux GPU cuda8cudnn5 +pip install paddlepaddle-gpu==1.2.0.post85 + +# 其他平台上的安装指引请参考 http://paddlepaddle.org/ +``` + + ## Features - **Flexibility** @@ -74,10 +100,38 @@ pip install paddlepaddle-gpu==1.2.0.post85 Baidu and it has achieved a significant impact. We hope you can also explore the capability of PaddlePaddle to make an impact on your product. +## 特点 + +- **灵活性** + + PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 + +- **高效性** + + 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: + + - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 + - 通过MKL-DNN库优化CNN网络 + - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 + - 针对高维稀疏数据模型,优化了局部和分布式训练。 + + +- **稳定性** + + 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 + +- **连接产品** + + 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 + ## Installation It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. +## 安装 + +推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) + ## Documentation We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and @@ -99,10 +153,37 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte We appreciate your contributions! +## 文档 + +我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 +[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 + +- [深度学习101](https://github.com/PaddlePaddle/book) + + 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 + +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) + + 可以在MPI集群上运行分布式训练任务 + +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) + + 新的API支持代码更少更简洁的程序 + +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) + + 欢迎您的贡献! ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). +## 答疑 + +欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 + ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). 
+ +## 版权和许可证 +PaddlePaddle由[Apache-2.0 license](LICENSE)提供 From 0b1c7d838cfeb2e2000839f173a9be2d641f3d47 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 14 Dec 2018 20:01:00 +0800 Subject: [PATCH 41/45] Add brpc serialization support. (#11430) --- benchmark/fluid/fluid_benchmark.py | 4 +- cmake/external/brpc.cmake | 20 +- cmake/external/gtest.cmake | 10 +- cmake/external/leveldb.cmake | 4 +- paddle/fluid/framework/CMakeLists.txt | 9 +- paddle/fluid/framework/details/CMakeLists.txt | 11 +- paddle/fluid/framework/executor.cc | 6 +- .../operators/distributed/CMakeLists.txt | 31 +- .../operators/distributed/brpc_client.cc | 371 +++++++++++++++--- .../fluid/operators/distributed/brpc_client.h | 99 ++++- .../operators/distributed/brpc_rdma_pool.cc | 84 ++++ .../operators/distributed/brpc_rdma_pool.h | 56 +++ .../distributed/brpc_sendrecvop_utils.cc | 196 +++++++++ .../distributed/brpc_sendrecvop_utils.h | 49 +++ .../operators/distributed/brpc_serde_test.cc | 175 +++++++++ .../operators/distributed/brpc_server.cc | 264 +++++++++++-- .../distributed/brpc_variable_response.cc | 73 ++++ .../distributed/brpc_variable_response.h | 67 ++++ .../operators/distributed/grpc_client.cc | 3 +- .../fluid/operators/distributed/grpc_serde.cc | 7 - .../fluid/operators/distributed/rpc_server.h | 4 + .../operators/distributed/sendrecvop_utils.cc | 2 +- .../operators/distributed/sendrecvop_utils.h | 7 + .../operators/distributed_ops/CMakeLists.txt | 4 +- .../distributed_ops/listen_and_serv_op.cc | 7 +- .../operators/distributed_ops/send_op.cc | 2 + paddle/fluid/pybind/pybind.cc | 9 + python/paddle/fluid/__init__.py | 1 + 28 files changed, 1422 insertions(+), 153 deletions(-) create mode 100644 paddle/fluid/operators/distributed/brpc_rdma_pool.cc create mode 100644 paddle/fluid/operators/distributed/brpc_rdma_pool.h create mode 100644 paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc create mode 100644 paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h create mode 100644 paddle/fluid/operators/distributed/brpc_serde_test.cc create mode 100644 paddle/fluid/operators/distributed/brpc_variable_response.cc create mode 100644 paddle/fluid/operators/distributed/brpc_variable_response.h diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 5f3ce300ac..10b633a4fc 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -81,9 +81,11 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog): # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") - config = distribute_transpiler.DistributeTranspilerConfig() + config = fluid.DistributeTranspilerConfig() config.slice_var_up = not args.no_split_var + config.min_block_size = 1048576 t = distribute_transpiler.DistributeTranspiler(config=config) + t.transpile( trainer_id, # NOTE: *MUST* use train_prog, for we are using with guard to diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 30b227b645..6b50cff7a6 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -14,14 +14,16 @@ INCLUDE(ExternalProject) -find_library(SSL_LIBRARY NAMES ssl) +find_package(OpenSSL REQUIRED) + +message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) +message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY}) + ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY}) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY}) -find_library(CRYPTO_LIBRARY NAMES crypto) 
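+# find_package(OpenSSL) populates OPENSSL_SSL_LIBRARY and
+# OPENSSL_CRYPTO_LIBRARY; wrapping them as imported `ssl`/`crypto` targets
+# lets the rest of the build keep linking against the plain target names.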
 ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY})
-
+SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY})

 SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
 SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
@@ -31,14 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
 INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})

 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib")
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")

 # If a minimal .a is needed, you can set WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
     extern_brpc
     ${EXTERNAL_PROJECT_LOG_ARGS}
+    # TODO(gongwb): change to the newest repo when it is updated.
     GIT_REPOSITORY "https://github.com/gongweibao/brpc"
-    GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a"
+    GIT_TAG "e9b67ec1b7458f2af5fae76451afe1e27e01b4b4"
     PREFIX ${BRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -50,7 +53,7 @@ ExternalProject_Add(
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
     -DCMAKE_PREFIX_PATH=${prefix_path}
-    -DBRPC_WITH_GLOG=ON
+    -DWITH_GLOG=ON
     -DIOBUF_WITH_HUGE_BLOCK=ON
     -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
     ${EXTERNAL_OPTIONAL_ARGS}
@@ -65,5 +68,6 @@ ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
+add_definitions(-DBRPC_WITH_GLOG)

 LIST(APPEND external_project_dependencies brpc)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 4fe9c13fb7..9be625b620 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -12,8 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-IF(WITH_TESTING)
-  ENABLE_TESTING()
+#FIXME:(gongwb) Move brpc's gtest dependency.
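+# The widened guard below is presumably needed because brpc itself links
+# against gtest, so the external gtest project has to be built whenever the
+# brpc backend is selected (WITH_DISTRIBUTE AND NOT WITH_GRPC), even with
+# testing disabled.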
+IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) + IF(WITH_TESTING) + ENABLE_TESTING() + ENDIF(WITH_TESTING) + INCLUDE(ExternalProject) SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest) @@ -76,4 +80,4 @@ IF(WITH_TESTING) ADD_DEPENDENCIES(gtest_main extern_gtest) LIST(APPEND external_project_dependencies gtest gtest_main) -ENDIF(WITH_TESTING) +ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake index fb5091731d..0df61b01ab 100644 --- a/cmake/external/leveldb.cmake +++ b/cmake/external/leveldb.cmake @@ -24,8 +24,8 @@ ExternalProject_Add( extern_leveldb ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${LEVELDB_SOURCES_DIR} - URL "https://github.com/google/leveldb/archive/v1.18.tar.gz" - URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0" + GIT_REPOSITORY "https://github.com/google/leveldb" + GIT_TAG v1.18 CONFIGURE_COMMAND "" BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d7a69c8c9..cea4a44857 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -169,9 +169,12 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog + lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper) + + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + else() if(WITH_NGRAPH) if(NOT WIN32) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a927a3afcd..97f7713d97 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -12,12 +12,19 @@ cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) +if(WITH_DISTRIBUTE) + if(NOT WITH_GRPC) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + endif() +endif() + if(WITH_GPU) nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) if(WITH_DISTRIBUTE) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim dynload_cuda selected_rows_functor sendrecvop_grpc) + ddim dynload_cuda selected_rows_functor sendrecvop_rpc) else() 
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda selected_rows_functor) @@ -30,7 +37,7 @@ else() variable_visitor) if(WITH_DISTRIBUTE) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim selected_rows_functor sendrecvop_grpc) + ddim selected_rows_functor sendrecvop_rpc) else() cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim selected_rows_functor) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 0c4bd336c5..8c3912120b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -157,9 +157,9 @@ void Executor::Close() { #ifdef PADDLE_WITH_DISTRIBUTE // TODO(typhoonzero): complete message will need to use real trainer_id, // except 0. - ::paddle::operators::distributed::RPCClient::GetInstance< - ::paddle::operators::distributed::GRPCClient>(0) - ->SendComplete(); + auto client = + paddle::operators::distributed::RPCClient::GetInstance(0); + client->SendComplete(); #endif } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 101dbe9c89..eab4297c73 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -12,7 +12,7 @@ configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @O set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") if(WITH_GRPC) - grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc + grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows_functor memory) @@ -20,36 +20,43 @@ if(WITH_GRPC) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor selected_rows_functor scope math_function SERIAL) endif() - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) else() - set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc - brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + 
set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc + brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc + collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc + brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc + brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) - set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) + set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor + proto_desc lookup_sparse_table_op snappystream snappy zlib) - cc_test(brpc_server_test SRCS rpc_server_test.cc + cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${brpc_test_depends} SERIAL) cc_test(brpc_serde_test SRCS brpc_serde_test.cc DEPS ${brpc_test_depends} SERIAL) + + if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL) + endif() endif() diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc index 350969f74b..62e32977b8 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -14,135 +14,316 @@ #include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { namespace distributed { -DEFINE_int32(brpc_channel_num, 24, - "Number of channels to send requests connected to one server"); DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); BRPCClient::~BRPCClient() { Wait(); } -void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response) { +void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { // std::unique_ptr makes sure cntl/response will be deleted before returning. std::unique_ptr cntl_guard(cntl); std::unique_ptr response_guard(response); + // this channel can be used by other now. 
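+  // ch_ptr is the per-endpoint BlockingQueue of ChannelContexts (channel
+  // plus stub); pushing the context back before inspecting the response
+  // returns the connection to the pool as early as possible, so other
+  // pending requests to this endpoint are not serialized behind response
+  // handling.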
+ ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + LOG(FATAL) << "Fail to send SendVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); return; } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleSendResponse"; } -bool BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); + const std::string method = "SendRPC"; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); - framework::AsyncIO( - [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); + auto* var = p_scope->FindVar(var_name_val); + sendrecv::VariableMessage request; + distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, + &cntl->request_attachment(), "", false, + trainer_id_); - google::protobuf::Closure* done = - brpc::NewCallback(&HandleSendResponse, cntl, response); + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - sendrecv::VariableMessage request; - ch_ctx->stub->SendVariable(cntl, &request, response, done); - }); + platform::RecordRPCEvent record_event(method, p_ctx); + + ch_ctx->stub->SendVariable(cntl, &request, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; } +void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + // this channel can be used other now. 
+ ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { + LOG(FATAL) << "Fail to get HandleFetchBarrierResponse: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); + return; + } + + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleFetchBarrierResponse"; +} void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response) { + sendrecv::VariableMessage* response, VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, + BRPCClient* cls) { // std::unique_ptr makes sure cntl/response will be deleted before returning. std::unique_ptr cntl_guard(cntl); std::unique_ptr response_guard(response); + // this channel can be used other now. + ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + LOG(FATAL) << "Fail to GetVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + cls->DecreaseReqCount(); + var_h->Finish(false); return; } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; - // framework::Variable* outvar = nullptr; - // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); + VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + + framework::Variable* outvar = nullptr; + int trainer_id; + distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), + *var_h->ctx(), var_h->scope(), &outvar, + &trainer_id); + VLOG(4) << "Finish HandleGetResponse"; + cls->DecreaseReqCount(); + var_h->Finish(true); } -bool BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& method_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); + const auto ch_ptr = GetChannel(ep_val); + const std::string method = "GetRPC"; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); - framework::AsyncIO( - [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_trainer_id(trainer_id_); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + platform::RecordRPCEvent record_event(method, p_ctx); + + if (method_name == "GetMonomerVariable") { + ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else { + ch_ctx->stub->GetVariable(cntl, &req, response, done); + } + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; +} + +VarHandlePtr 
BRPCClient::AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); +} + +VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, + const std::string& var_name, + int64_t time_out) { + return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); } -bool BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); +} + +VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; + const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); + const auto ch_ptr = GetChannel(ep_val); + + const std::string method = "PrefetchRPC"; + + VarHandlePtr var_h( + new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + auto* var = p_scope->FindVar(in_var_name_val); + sendrecv::VariableMessage req; + distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, + &cntl->request_attachment(), out_var_name_val, + false, 0, table_name_val); + + platform::RecordRPCEvent record_event(method, p_ctx); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] {}); + ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; } -void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - req_count_++; +VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + time_out); } -void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + sendrecv::VariableMessage req; + req.set_varname(FETCH_BARRIER_MESSAGE); + + const std::string method = "FetchBarrierRPC"; + // var handle + VarHandlePtr var_h( + new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + + 
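+  // RecordRPCEvent opens a profiler span for this call; together with the
+  // var_h->Wait() guarded by IsProfileEnabled() below, the RPC runs
+  // effectively synchronously while profiling, so the span covers the whole
+  // round trip (assumed intent; the profiler contract is not stated here).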
platform::RecordRPCEvent record_event(method, nullptr); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + ch_ctx->stub->GetVariable(cntl, &req, response, done); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; } -void BRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +bool BRPCClient::Wait() { + VLOG(9) << "begin to brpcclient wait"; + { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); + } + VLOG(9) << "end to brpcclient wait"; + return true; } ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + VLOG(4) << "begin to GetChannel:" << ep; { std::lock_guard guard(chan_mutex_); auto it = channels_.find(ep); if (it != channels_.end()) { + VLOG(4) << "end to GetChannel:" << ep; return it->second; } } @@ -150,12 +331,20 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { ChannelQueuePtr q(new framework::BlockingQueue()); brpc::ChannelOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif options.protocol = "baidu_std"; - options.connection_type = "pooled"; - options.connect_timeout_ms = 100; + // don't use pooled type. the server can't afford that. + options.connection_type = "single"; + options.connect_timeout_ms = 1000; options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; options.max_retry = FLAGS_max_retry; - for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { + + VLOG(1) << "create " << brpc_channel_num_per_server_ + << " brpc channels to pserver:" << ep; + + for (int i = 0; i < brpc_channel_num_per_server_; ++i) { std::shared_ptr c(new ChannelContext()); if (c->channel.Init(ep.c_str(), &options) != 0) { LOG(FATAL) << "Fail to initialize channel"; @@ -172,9 +361,75 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { channels_[ep] = q; } + VLOG(4) << "end to GetChannel:" << ep; return q; } +VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); +} + +void BRPCClient::SendComplete() { + for (auto& kv : channels_) { + AsyncSendComplete(kv.first); + } +} + +VarHandlePtr BRPCClient::AsyncSendVarMessage( + const std::string& ep, const std::string& method_name, + const sendrecv::VariableMessage& req, int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + platform::RecordRPCEvent record_event(method_name, nullptr); + + VarHandlePtr var_h( + new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + if (method_name == "CheckPointNotifyRPC") { + ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); + } else if (method_name == "GetMonomerBarrier") { + ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); + } else { + ch_ctx->stub->SendVariable(cntl, &req, response, done); + } + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; +} + +VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, + int64_t time_out) { + 
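+  // Control RPCs (BATCH_BARRIER_MESSAGE, FETCH_BARRIER_MESSAGE,
+  // COMPLETE_MESSAGE, ...) reuse the VariableMessage envelope: the sentinel
+  // string travels in the varname field and the server dispatches on it,
+  // so no dedicated proto message type is needed for barriers.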
sendrecv::VariableMessage req; + req.set_varname(message); + + return AsyncSendVarMessage(ep, method_name, req, time_out); +} + +VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(CHECKPOINT_SAVE_MESSAGE); + req.set_out_varname(dir); + + return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); +} + } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h index 8ff1f0a607..80cc81bff3 100644 --- a/paddle/fluid/operators/distributed/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc_client.h @@ -31,6 +31,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN @@ -53,33 +55,94 @@ class BRPCClient : public RPCClient { BRPCClient() {} virtual ~BRPCClient(); - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) override; - void Wait() override; + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncCheckpointNotify( + const 
std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool Wait() override; + + void SendComplete() override; private: + VarHandlePtr _AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& method_name, + int64_t time_out = FLAGS_rpc_deadline); + void Proceed(); ChannelQueuePtr GetChannel(const std::string& ep); + VarHandlePtr AsyncSendComplete(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline); + + VarHandlePtr AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, int64_t time_out); + + VarHandlePtr AsyncSendVarMessage(const std::string& ep, + const std::string& method_name, + const sendrecv::VariableMessage& req, + int64_t time_out); + + friend void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, + BRPCClient* cls); + void DecreaseReqCount() { + if (--req_count_ <= 0) { + sync_cond_.notify_all(); + } + } + private: std::unordered_map channels_; @@ -88,6 +151,8 @@ class BRPCClient : public RPCClient { std::condition_variable sync_cond_; std::atomic req_count_{0}; + static constexpr int brpc_channel_num_per_server_ = 4; + // mutex for GetChannel thread safety std::mutex chan_mutex_; DISABLE_COPY_AND_ASSIGN(BRPCClient); diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc new file mode 100644 index 0000000000..e1be5673df --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
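+// Background (general RDMA behavior, not specific to this patch): an RDMA
+// NIC can only do zero-copy transfers on buffers that were registered
+// (pinned) in advance. The pool below caches varname -> registered buffer so
+// each variable's memory is registered with brpc::rdma once and then reused
+// for every subsequent send.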
+ +#ifdef PADDLE_WITH_BRPC_RDMA + +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "brpc/channel.h" +#include "brpc/rdma/rdma_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace distributed { + +RdmaMemPool& RdmaMemPool::Instance() { + static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); + return *g_rdma_mem_pool; +} + +void* RdmaMemPool::Find(const std::string& varname, int64_t size) { + pthread_rwlock_rdlock(&access_); + auto it = pool_.find(varname); + if (it == pool_.end()) { + pthread_rwlock_unlock(&access_); + return nullptr; + } + + auto info = it->second; + if (info.data_size != size) { + pthread_rwlock_unlock(&access_); + PADDLE_ENFORCE(false, "var:%s size:%ld != %ld", varname, size, + info.data_size); + return nullptr; + } + + pthread_rwlock_unlock(&access_); + return info.data; +} + +void RdmaMemPool::Register(const std::string& varname, void* data, + int64_t data_size) { + void* old = Find(varname, data_size); + if (old != nullptr) { + if (data != old) { + PADDLE_ENFORCE(false, "var:%s data:%ld != %ld", varname, data, old); + } + VLOG(7) << "Find on rdma:" << varname << " data:" << data + << " data_size:" << data_size; + return; + } + + VarInfo info; + info.data = data; + info.data_size = data_size; + + pthread_rwlock_wrlock(&access_); + pool_[varname] = info; + pthread_rwlock_unlock(&access_); + + if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { + LOG(FATAL) << "register " << varname << " data:" << data + << " data_size:" << data_size << " error"; + } + + VLOG(4) << "register on rdma:" << varname << " data:" << data + << " data_size:" << data_size; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc_rdma_pool.h new file mode 100644 index 0000000000..156a93ec57 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_BRPC_RDMA + +#include // NOLINT +#include +#include + +namespace paddle { +namespace operators { +namespace distributed { + +/* + * This class is used to avoid duplicated registion of brpc::rdma. 
+ */ +class RdmaMemPool { + public: + static RdmaMemPool& Instance(); + RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} + + virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } + + void Register(const std::string& varname, void* data, int64_t size); + void* Find(const std::string& varname, int64_t size); + + private: + struct VarInfo { + void* data; + int64_t data_size; + + VarInfo() : data(nullptr), data_size(0) {} + }; + + private: + std::unordered_map pool_; + pthread_rwlock_t access_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc new file mode 100644 index 0000000000..6fed9ba92c --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include // NOLINT + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class IOBufWriter { + public: + static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) { + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + iobuf->append(v, vlen); + } + + static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, + int64_t vlen, bool in_cuda_pinned, + void (*destroy)(void*), void* user_data) { + VLOG(7) << "AppendTCPZeroCopy " + << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + // FIXME(gongwb): use append_zerocopy + /* + if (in_cuda_pinned) { + iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); + } else { + iobuf->append_zerocopy(v, vlen, nullptr); + } + */ + iobuf->append(v, vlen); + destroy(user_data); + } + +#ifdef PADDLE_WITH_BRPC_RDMA + static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { + VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + RdmaMemPool::Instance().Register( + varname, static_cast(const_cast(v)), vlen); + + // FIXME(gongwb): use append_zerocopy + // iobuf->append_zerocopy(v, vlen, nullptr); + 
iobuf->append(v, vlen); + destroy(user_data); + return; + } +#endif + + static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { +#ifdef PADDLE_WITH_BRPC_RDMA + IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, + destroy, user_data); +#else + IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, + user_data); +#endif + } +}; + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, int trainer_id, + const std::string& table_name) { + std::unique_ptr payload; + + request->set_varname(name); + request->set_trainer_id(trainer_id); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. + if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request->set_profile(platform::kEnableProfiler); + } else { + request->set_profile(platform::kDisableProfiler); + } + } + if (!out_varname.empty()) { + request->set_out_varname(out_varname); + } + if (!table_name.empty()) { + request->set_table_name(table_name); + } + if (var->IsType()) { + request->set_type(::sendrecv::LOD_TENSOR); + payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); + } else if (var->IsType()) { + request->set_type(::sendrecv::SELECTED_ROWS); + payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); +#ifdef PADDLE_WITH_CUDA + } else if (var->IsType()) { + request->set_type(::sendrecv::NCCL_ID); + const ncclUniqueId& uid = var->Get(); + // TODO(gongwb): use append_zero to avoid data copy. + IOBufWriter::Append(iobuf, + sendrecv::VariableMessage::kSerializedFieldNumber, + uid.internal, NCCL_UNIQUE_ID_BYTES); + return; +#endif + } else { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + } + + PADDLE_ENFORCE_NOT_NULL(payload); + + // FIXME(gongwb): it seems that can use zero copy. 
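+  // var_is_not_stable presumably marks payloads whose backing memory may be
+  // reused before the asynchronous send drains, so they are always copied
+  // into the IOBuf. Stable buffers go through AppendZeroCopy, which already
+  // carries a destroy callback for the TensorPayload, although the actual
+  // append_zerocopy call is still disabled by the FIXMEs above.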
+ if (var_is_not_stable) { + IOBufWriter::Append( + iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size()); + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + true, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); +#endif + } else { + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + false, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); + } + } + + if (var->IsType()) { + auto* slr = var->GetMutable(); + size_t rows_memory_size = + slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + + IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, + reinterpret_cast(slr->rows().data()), + static_cast(rows_memory_size)); + } +} + +void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, + const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id) { + operators::distributed::BRPCVariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(iobuf, meta) == 0, "parse iobuf to tensor error!"); + *var = resp.GetVar(); + *trainer_id = resp.GetTrainerId(); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h new file mode 100644 index 0000000000..ffaf442224 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, const int trainer_id = 0, + const std::string& table_name = std::string()); + +void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc_serde_test.cc new file mode 100644 index 0000000000..2a2dc72150 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_serde_test.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include // NOLINT + +#include "brpc/channel.h" +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 564 * 128; + + // serialize var to IOBuf + { + framework::Variable var; + auto* slr = var.GetMutable(); + slr->set_height(1000); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({564, 128})); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 564; ++i) rows->push_back(i); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // desrialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto* slr2 = var2->GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + for (size_t i = 0; i < rows2->size(); ++i) { + EXPECT_EQ(rows_data2[i], static_cast(i)); + } + EXPECT_EQ(slr2->height(), 1000); + } +} + +void RunTestLodTensor(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 512 * 8 * 4 * 2; + { + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({512, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // check sendrecv::VariableMessage meta data + { + EXPECT_EQ(msg.varname(), "myvar"); + EXPECT_EQ(msg.type(), 0); + EXPECT_EQ(msg.dims()[0], 512); + EXPECT_EQ(msg.dims()[1], 8); + EXPECT_EQ(msg.dims()[2], 4); + EXPECT_EQ(msg.dims()[3], 2); + EXPECT_EQ(msg.lod_level(), 1); + EXPECT_EQ(msg.lod(0).lod_data(0), 1); + 
+    EXPECT_EQ(msg.lod(0).lod_data(1), 3);
+    EXPECT_EQ(msg.lod(0).lod_data(2), 8);
+  }
+
+  // deserialize
+  {
+    framework::Scope scope;
+    scope.Var("myvar");
+    operators::distributed::BRPCVariableResponse resp(&scope, &ctx);
+    EXPECT_EQ(resp.Parse(iobuf, msg), 0);
+
+    framework::Variable* var2 = resp.GetVar();
+
+    auto tensor2 = var2->Get<framework::LoDTensor>();
+    float* tensor_data2 = nullptr;
+    framework::Tensor tmp_tensor;
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      platform::CPUPlace cpu;
+      framework::TensorCopy(tensor2, cpu, &tmp_tensor);
+      tensor_data2 = tmp_tensor.data<float>();
+    } else {
+      tensor_data2 = const_cast<float*>(tensor2.data<float>());
+    }
+
+    for (int i = 0; i < tensor_numel; ++i)
+      EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
+  }
+}
+
+TEST(LodTensor, Run) {
+  platform::CPUPlace place;
+  RunTestLodTensor(place);
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu(0);
+  RunTestLodTensor(gpu);
+#endif
+}
+
+TEST(SelectedRows, Run) {
+  platform::CPUPlace place;
+  RunSerdeTestSelectedRows(place);
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu;
+  RunSerdeTestSelectedRows(gpu);
+#endif
+}
diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc
index 862167f020..78d41aeac5 100644
--- a/paddle/fluid/operators/distributed/brpc_server.cc
+++ b/paddle/fluid/operators/distributed/brpc_server.cc
@@ -13,84 +13,287 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/distributed/brpc_server.h"
+#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 
 namespace sendrecv {
 
-typedef std::unordered_map<std::string,
-                           paddle::operators::distributed::RequestHandler*>
+namespace distributed = paddle::operators::distributed;
+
+typedef std::unordered_map<std::string, distributed::RequestHandler*>
     HandlerMap;
 
 class BRPCServiceImpl : public SendRecvService {
 public:
-  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map)
-      : request_send_h_(nullptr),
-        request_get_h_(nullptr),
-        request_prefetch_h_(nullptr) {
-    auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
+  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map,
+                           distributed::RPCServer* rpc_server)
+      : rpc_server_(rpc_server) {
+    VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size();
+    auto it = rpc_call_map.find(distributed::kRequestSend);
     if (it != rpc_call_map.end()) {
       request_send_h_ = it->second;
+      send_threads_.reset(new paddle::framework::ThreadPool(
+          rpc_server_->GetThreadNum(distributed::kRequestSend)));
     }
 
-    it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
+    it = rpc_call_map.find(distributed::kRequestGet);
     if (it != rpc_call_map.end()) {
       request_get_h_ = it->second;
+      get_threads_.reset(new paddle::framework::ThreadPool(
+          rpc_server_->GetThreadNum(distributed::kRequestGet)));
     }
 
-    it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
+    it = rpc_call_map.find(distributed::kRequestPrefetch);
     if (it != rpc_call_map.end()) {
       request_prefetch_h_ = it->second;
+      prefetch_threads_.reset(new paddle::framework::ThreadPool(
+          rpc_server_->GetThreadNum(distributed::kRequestPrefetch)));
+    }
+
+    it = rpc_call_map.find(distributed::kRequestCheckpoint);
+    if (it != rpc_call_map.end()) {
+      request_checkpoint_h_ = it->second;
+      checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool(
+          rpc_server_->GetThreadNum(distributed::kRequestCheckpoint)));
+    }
+
+    it = rpc_call_map.find(distributed::kRequestGetMonomerVariable);
+    if (it != rpc_call_map.end()) {
+      request_get_monomer_handler_h_ = it->second;
+    }
+
+    it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier);
+    if (it != rpc_call_map.end()) {
+      request_get_monomer_barrier_handler_h_ = it->second;
+    }
   }
 
   virtual ~BRPCServiceImpl() {}
-
   void SendVariable(google::protobuf::RpcController* cntl_butil,
                     const VariableMessage* request, VoidMessage* response,
                     google::protobuf::Closure* done) override {
+    send_threads_->Run(
+        [=] { _SendVariable(cntl_butil, request, response, done); });
+  }
+
+  void _SendVariable(google::protobuf::RpcController* cntl_butil,
+                     const VariableMessage* request, VoidMessage* response,
+                     google::protobuf::Closure* done) {
     PADDLE_ENFORCE(request_send_h_ != nullptr,
                    "RequestSend handler should be registered first!");
     brpc::ClosureGuard done_guard(done);
-
-    paddle::framework::Scope* local_scope = request_send_h_->scope();
-    paddle::framework::Variable* outvar = nullptr;
-    paddle::framework::Variable* invar = nullptr;
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
 
     std::string varname = request->varname();
+    VLOG(3) << "RequestSend var_name:" << varname
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
 
-    if (!request_send_h_->sync_mode()) {
-      local_scope = &request_send_h_->scope()->NewScope();
-      invar = local_scope->Var(varname);
-    } else {
-      invar = local_scope->FindVar(varname);
-    }
+    distributed::BRPCVariableResponse resp(request_send_h_->scope(),
+                                           request_send_h_->dev_ctx(),
+                                           !request_send_h_->sync_mode());
+    PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
+                   "parse iobuf to tensor error!");
 
-    request_send_h_->Handle(varname, local_scope, invar, &outvar);
+    auto scope = resp.GetMutableLocalScope();
+    auto invar = resp.GetVar();
+    int trainer_id = request->trainer_id();
+    paddle::framework::Variable* outvar = nullptr;
 
-    if (!request_send_h_->sync_mode()) {
-      request_send_h_->scope()->DeleteScope(local_scope);
-    }
+    request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id);
   }
 
   void GetVariable(google::protobuf::RpcController* cntl_butil,
                    const VariableMessage* request, VariableMessage* response,
                    google::protobuf::Closure* done) override {
+    get_threads_->Run(
+        [=] { _GetVariable(cntl_butil, request, response, done); });
+  }
+
+  void _GetVariable(google::protobuf::RpcController* cntl_butil,
+                    const VariableMessage* request, VariableMessage* response,
+                    google::protobuf::Closure* done) {
     PADDLE_ENFORCE(request_get_h_ != nullptr,
                    "RequestGet handler should be registered first!");
-  }
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    std::string varname = request->varname();
+    VLOG(3) << "RequestGet varname:" << varname
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
+
+    auto scope = request_get_h_->scope();
+    auto invar = scope->FindVar(varname);
+    int trainer_id = request->trainer_id();
+    paddle::framework::Variable* outvar = nullptr;
+
+    request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id);
+
+    if (outvar) {
+      distributed::SerializeToIOBuf(varname, outvar,
+                                    *request_get_h_->dev_ctx(), response,
+                                    &cntl->response_attachment(), "", false);
+    }
+  }
 
   void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
                         const VariableMessage* request,
                         VariableMessage* response,
                         google::protobuf::Closure* done) override {
+    prefetch_threads_->Run(
+        [=] { _PrefetchVariable(cntl_butil, request, response, done); });
+  }
+
+  void _PrefetchVariable(google::protobuf::RpcController* cntl_butil,
+                         const VariableMessage* request,
+                         VariableMessage* response,
+                         google::protobuf::Closure* done) {
     PADDLE_ENFORCE(request_prefetch_h_ != nullptr,
                    "kRequestPrefetch handler should be registered first!");
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    // prefetch process...
+    std::string in_var_name = request->varname();
+    std::string out_var_name = request->out_varname();
+    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+            << ", out_var_name: " << out_var_name
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
+
+    distributed::BRPCVariableResponse resp(
+        request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true);
+
+    PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
+                   "parse iobuf to tensor error!");
+
+    auto scope = resp.GetMutableLocalScope();
+    auto invar = scope->FindVar(in_var_name);
+    std::string table_name = request->table_name();
+    int trainer_id = request->trainer_id();
+    paddle::framework::Variable* outvar = scope->Var(out_var_name);
+
+    request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
+                                out_var_name, table_name);
+
+    distributed::SerializeToIOBuf(out_var_name, outvar,
+                                  *request_prefetch_h_->dev_ctx(), response,
+                                  &cntl->response_attachment(), "", true);
+  }
+
+  void CheckpointNotify(google::protobuf::RpcController* cntl_butil,
+                        const VariableMessage* request, VoidMessage* response,
+                        google::protobuf::Closure* done) override {
+    checkpoint_notify_threads_->Run(
+        [=] { _CheckpointNotify(cntl_butil, request, response, done); });
+  }
+
+  void _CheckpointNotify(google::protobuf::RpcController* cntl_butil,
+                         const VariableMessage* request, VoidMessage* response,
+                         google::protobuf::Closure* done) {
+    PADDLE_ENFORCE(
+        request_checkpoint_h_ != nullptr,
+        "kRequestCheckpointNotify handler should be registered first!");
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(),
+                                           request_checkpoint_h_->dev_ctx());
+
+    auto scope = resp.GetMutableLocalScope();
+
+    std::string checkpoint_notify = request->varname();
+    std::string checkpoint_dir = request->out_varname();
+    int trainer_id = request->trainer_id();
+
+    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
+            << ", dir: " << checkpoint_dir
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
+
+    request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr,
+                                  trainer_id, checkpoint_dir);
+  }
+
+  void GetMonomerVariable(google::protobuf::RpcController* cntl_butil,
+                          const VariableMessage* request,
+                          VariableMessage* response,
+                          google::protobuf::Closure* done) override {
+    PADDLE_ENFORCE(
+        request_get_monomer_handler_h_ != nullptr,
+        "kRequestGetMonomerVariable handler should be registered first!");
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    // proc request.
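+    // Block until the monomer variable has been produced, then look up the
+    // scope and device context registered for it via its MonomerHandle.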
+    std::string varname = request->varname();
+    VLOG(3) << "GetMonomerVariable " << varname
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
+
+    rpc_server_->WaitVarCond(varname);
+    distributed::MonomerHandle h = rpc_server_->GetMonomer(varname);
+
+    auto scope = h.scope_;
+    auto invar = scope->FindVar(varname);
+    paddle::framework::Variable* outvar = nullptr;
+
+    request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar,
+                                           request->trainer_id());
+
+    if (outvar) {
+      distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response,
+                                    &cntl->response_attachment(), "", false);
+    }
+  }
+
+  void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil,
+                         const VariableMessage* request, VoidMessage* response,
+                         google::protobuf::Closure* done) override {
+    PADDLE_ENFORCE(
+        request_get_monomer_barrier_handler_h_ != nullptr,
+        "RequestGetMonomerBarrier handler should be registered first!");
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    std::string varname = request->varname();
+    VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname
+            << ", trainer_id:" << request->trainer_id()
+            << ", from:" << cntl->remote_side();
+
+    rpc_server_->WaitVarCond(varname);
+    distributed::MonomerHandle h = rpc_server_->GetMonomer(varname);
+
+    paddle::framework::Scope* scope = nullptr;
+    paddle::framework::Variable* invar = nullptr;
+    paddle::framework::Variable* outvar = nullptr;
+
+    request_get_monomer_barrier_handler_h_->Handle(
+        varname, scope, invar, &outvar, request->trainer_id());
   }
 
 private:
-  paddle::operators::distributed::RequestHandler* request_send_h_;
-  paddle::operators::distributed::RequestHandler* request_get_h_;
-  paddle::operators::distributed::RequestHandler* request_prefetch_h_;
+  distributed::RequestHandler* request_send_h_{nullptr};
+  distributed::RequestHandler* request_get_h_{nullptr};
+  distributed::RequestHandler* request_prefetch_h_{nullptr};
+  distributed::RequestHandler* request_checkpoint_h_{nullptr};
+  distributed::RequestHandler* request_get_monomer_handler_h_{nullptr};
+  distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr};
+
+  distributed::RPCServer* rpc_server_{nullptr};
+
+  // FIXME(gongwb): brpc should support serving each RPC type with its own
+  // thread pool.
+  std::unique_ptr<paddle::framework::ThreadPool> send_threads_;
+  std::unique_ptr<paddle::framework::ThreadPool> get_threads_;
+  std::unique_ptr<paddle::framework::ThreadPool> prefetch_threads_;
+  std::unique_ptr<paddle::framework::ThreadPool> checkpoint_notify_threads_;
 };
 }  // namespace sendrecv
 
@@ -100,7 +303,7 @@ namespace distributed {
 
 void AsyncBRPCServer::StartServer() {
   // Instance of your service.
-  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_);
+  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this);
 
   // Add the service into server. Notice the second parameter, because the
   // service is put on stack, we don't want server to delete it, otherwise
@@ -111,6 +314,9 @@ void AsyncBRPCServer::StartServer() {
   }
 
   brpc::ServerOptions options;
+#ifdef PADDLE_WITH_BRPC_RDMA
+  options.use_rdma = true;
+#endif
   options.idle_timeout_sec = idle_timeout_s_;
   options.max_concurrency = max_concurrency_;
   if (server_.Start(bind_address_.c_str(), &options) != 0) {
diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc_variable_response.cc
new file mode 100644
index 0000000000..75306d7233
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_variable_response.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+namespace pb = ::google::protobuf;
+using vr = ::sendrecv::VariableMessage;
+
+int BRPCVariableResponse::Parse(Source* source) {
+  pb::io::ZeroCopyInputStream* input_stream = source->contents();
+  pb::io::CodedInputStream input(input_stream);
+  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
+
+  while (1) {
+    unsigned int tag = 0;
+    if (!input.ReadLittleEndian32(&tag)) {
+      break;
+    }
+
+    uint64_t num_bytes = 0;
+    if (!input.ReadLittleEndian64(&num_bytes)) {
+      break;
+    }
+
+    int field = static_cast<int>(tag);
+    int ret = field == 0 ? -1 : field;
+    switch (field) {
+      case vr::kSerializedFieldNumber: {
+        if (!ProcSerializedField(field, &input, num_bytes)) {
+          return ret;
+        }
+        break;
+      }
+      case vr::kRowsFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info should be obtained first!");
+
+        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
+          return ret;
+        }
+        break;
+      }
+      default: {
+        PADDLE_ENFORCE(false, "unsupported field number %d", field);
+        return ret;
+      }
+    }
+  }
+
+  return 0;
+}
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc_variable_response.h
new file mode 100644
index 0000000000..b0b91a42a0
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_variable_response.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
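+//
+// BRPCVariableResponse adapts the transport-agnostic VariableResponse parser
+// to brpc: BRPCSourceWrapper below exposes a butil::IOBuf attachment as a
+// protobuf ZeroCopyInputStream so the shared parsing logic can consume it
+// unchanged.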
+
+#pragma once
+
+#include <string>
+
+#include "brpc/channel.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class BRPCSourceWrapper : public Source {
+ public:
+  explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {}
+  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
+    return &source_;
+  }
+
+ private:
+  butil::IOBufAsZeroCopyInputStream source_;
+};
+
+class BRPCVariableResponse : public VariableResponse {
+ public:
+  BRPCVariableResponse(const framework::Scope* scope,
+                       const platform::DeviceContext* dev_ctx,
+                       bool create_scope = false)
+      : VariableResponse(scope, dev_ctx, create_scope) {}
+
+  virtual ~BRPCVariableResponse() {}
+
+  // parse attachment from iobuf
+  int Parse(Source* source) override;
+  int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) {
+    BRPCSourceWrapper wrapper(iobuf);
+    return VariableResponse::Parse(&wrapper, meta);
+  }
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index f14dfcdb23..78956c9ea4 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -293,8 +293,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
   const auto ch = GetChannel(ep);
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
   const std::string method = "SendMonomerFetchBarrierRPC";
-  VarHandlePtr h(
-      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
+  VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr));
   s->Prepare(h, time_out);
 
   VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
index 31fac2133c..1f797ea91d 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -32,13 +32,6 @@ namespace paddle {
 namespace operators {
 namespace distributed {
 
-static void SerializeDestroyCallback(void* payload) {
-  if (payload != nullptr) {
-    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
-    delete shared_payload;
-  }
-}
-
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg, const std::string& out_name,
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index 45d1d3479c..8c7b7f1d7e 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -75,6 +75,10 @@ class RPCServer {
   void RegisterRPC(const std::string& rpc_name, RequestHandler* handler,
                    int thread_num = 5);
 
+  int GetThreadNum(const std::string& rpc_name) {
+    return rpc_thread_num_[rpc_name];
+  }
+
   // Wait until all the clients have reached the barrier for one
   // rpc method. This function should be called in the
   // RequestHandler if you want to run the server/client in a
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 6ba883ba01..5aadbcf220 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <thread>  // NOLINT
 
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/platform/port.h"
@@ -45,7 +46,6 @@ static TensorPayload GetCommunicationAllocationFromTensor(
       memory::Copy(cuda_pinned, result->ptr(),
                    boost::get<platform::CUDAPlace>(tensor.place()),
                    tensor.data<void>(), copy_size, gpu_dev_ctx.stream());
-      ctx.Wait();
 
       return TensorPayload(result);
 #else
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
index 523e56fe3e..1a32ffdbec 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -50,6 +50,13 @@ class TensorPayload final {
   size_t memory_size_;
 };
 
+inline void SerializeDestroyCallback(void* payload) {
+  if (payload != nullptr) {
+    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
+    delete shared_payload;
+  }
+}
+
 TensorPayload GetTensorPayload(framework::Variable* var,
                                const platform::DeviceContext& ctx,
                                VarMsg* request);
diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
index 28bb90af56..3c0b7ff24f 100644
--- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
@@ -2,9 +2,9 @@ include(operators)
 
 set(DISTRIBUTE_DEPS "")
 if(WITH_GRPC)
-  set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
+  set(DISTRIBUTE_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
 else()
-  set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
+  set(DISTRIBUTE_DEPS sendrecvop_rpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
   if(WITH_BRPC_RDMA)
     find_library(IBVERBS_LIBRARY NAMES ibverbs)
     ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index ab92ad4506..20870ea07e 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -26,10 +26,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
+#include "paddle/fluid/platform/profiler.h"
 
-DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send");
-DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get");
-DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch");
+DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
+DEFINE_int32(rpc_get_thread_num, 12, "number of threads for rpc get");
+DEFINE_int32(rpc_prefetch_thread_num, 12, "number of threads for rpc prefetch");
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc
index 58a3ca8272..0bf4bebbc9 100644
--- a/paddle/fluid/operators/distributed_ops/send_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_op.cc
@@ -58,7 +58,9 @@ class SendOp : public framework::OperatorBase {
     }
     if (sync_send) {
       for (size_t i = 0; i < rets.size(); i++) {
+        VLOG(7) << "before sync_send " << ins[i] << " from " << epmap[i];
        PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+        VLOG(7) << "after sync_send " << ins[i] << " from " << epmap[i];
       }
     }
   }
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 74b4f2e937..d590c3a3c6 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -81,6 +81,14 @@ bool IsCompiledWithCUDA() {
 #endif
 }
 
+bool IsCompiledWithBrpc() {
+#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA)
+  return true;
+#else
+  return false;
+#endif
+}
+
 bool IsCompiledWithDIST() {
 #ifdef PADDLE_WITH_DISTRIBUTE
   return true;
@@ -631,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle.
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
 #ifdef PADDLE_WITH_CUDA
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index e0bb0d1152..2dea71d7af 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -152,6 +152,7 @@ def __bootstrap__():
         'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
         'cudnn_exhaustive_search', 'selected_gpus'
     ]
+    core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])

From 2ebf12f340a82e1512f5f889d37b41e76b9eb3f7 Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Sun, 16 Dec 2018 09:52:58 +0800
Subject: [PATCH 42/45] fix

test=develop
---
 paddle/scripts/paddle_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 0fc43f33d0..a0da89d319 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -518,7 +518,7 @@ function assert_api_spec_approvals() {
         fi
     done
 
-    HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep const_cast || true`
+    HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
     if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
         APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
         python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`

From e439257ef7880e9b0b19d7b0c7ef8965fc180279 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 17 Dec 2018 11:24:19 +0800
Subject: [PATCH 43/45] Fix include style

test=develop
---
 paddle/fluid/framework/tensor.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 6ddc07af9a..6a1cbe5cd5 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -14,15 +14,14 @@ limitations under the License. */
 
 #pragma once
 
-#include <cstdint>
 #include <cstring>
 #include <memory>
 #include <typeindex>
 #include <utility>
 #include <vector>
-
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"

From 4de1a8bd9d55469f0612cf8f60b749681a5d657c Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 17 Dec 2018 14:15:27 +0800
Subject: [PATCH 44/45] Remove unused cmake log

test=develop
---
 cmake/external/python.cmake | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 52ad02a355..623c53f4f7 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -79,6 +79,5 @@ IF(PYTHONINTERP_FOUND)
         "please use pip to upgrade protobuf. pip install -U protobuf")
     ENDIF()
 ENDIF(PYTHONINTERP_FOUND)
-message(STATUS ${PYTHON_INCLUDE_DIR})
 INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
 INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})

From b5fa916413aebd0d35af8b3ae04d4d555ecb4629 Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Tue, 18 Dec 2018 08:38:52 +0000
Subject: [PATCH 45/45] fix bug after merge reyoung optimization, test=develop
---
 .../fluid/operators/hierarchical_sigmoid_op.h |  1 -
 .../fluid/operators/math/matrix_bit_code.cc   | 35 -------------------
 paddle/fluid/operators/math/matrix_bit_code.h | 29 +++++++--------
 3 files changed, 15 insertions(+), 50 deletions(-)

diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index 802b444d7c..b47bf49ecb 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -71,7 +71,6 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
       // server
       auto height_sections = ctx.Attr<std::vector<int64_t>>("height_sections");
       auto table_names = ctx.Attr<std::vector<std::string>>("table_names");
-      VLOG(3) << "path type is " << path->type().name();
       std::vector<int64_t> real_rows = PathToRows(*path);
       framework::Scope& local_scope = ctx.scope().NewScope();
       auto* ids = local_scope.Var("Ids@Prefetch");
diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
index d55e832cc2..d6f51c6e5c 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -84,41 +84,6 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
   code_table_.apply_visitor(func);
 }
 
-template <typename T>
-struct MatrixBitCodeFunctorSelectedRowsAddGrad
-    : public boost::static_visitor<void> {
-  const framework::Tensor &tmat_;
-  framework::SelectedRows *vec_;
-
-  MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat,
-                                          framework::SelectedRows *vec)
-      : tmat_(tmat), vec_(vec) {}
-
-  template <typename CodeTable>
-  void operator()(const CodeTable &code_table) {
-    size_t batch_size = tmat_.dims()[0];
-    size_t width = tmat_.dims()[1];
-    auto *vec_data = vec_->mutable_value()->template data<T>();
-    auto *tmat_data = tmat_.data<T>();
-    for (size_t i = 0; i < batch_size; ++i) {
-      auto code = code_table.get_code(i);
-      int code_length = code.get_length();
-      for (int j = 0; j < code_length; ++j) {
-        size_t index = code.calc_index(j);
-        int64_t row_index = vec_->GetIndexFromId(static_cast<int64_t>(index));
-        vec_data[row_index] += tmat_data[i * width + j];
-      }
-    }
-  }
-};
-
-template <typename T>
-void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
-                                      framework::SelectedRows *vec) {
-  MatrixBitCodeFunctorSelectedRowsAddGrad<T> func(tmat, vec);
-  code_table_.apply_visitor(func);
-}
-
 template <typename T>
 struct MatrixBitCodeFunctorSum : public boost::static_visitor<void> {
   const framework::Tensor &tmat_;
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index 7a084a41e5..c399cb5d44 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -124,11 +124,12 @@ class SimpleCode {
 template <typename T>
 class CustomCode {
  public:
-  CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode,
-             const int64_t* ids, int index) {
-    seq_len_ = ptable.dims()[1];
-    ptable_data_ = ptable.data<T>() + seq_len_ * index;
-    pcode_data_ = pcode.data<T>() + seq_len_ * index;
+  CustomCode(const framework::Tensor& path_table,
+             const framework::Tensor& path_code, const int64_t* ids,
+             int index) {
+    seq_len_ = path_table.dims()[1];
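+    // Each row of path_table/path_code holds one sample's path, padded with
+    // negative values past its end (see get_length()); `index` selects the
+    // row that belongs to this sample.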
+    path_table_data_ = path_table.data<T>() + seq_len_ * index;
+    path_code_data_ = path_code.data<T>() + seq_len_ * index;
   }
   /**
    * Here the id of root should be 1 rather than 0, thus the encoding of class
    * c is `c + num_classes` and all siblings can get the same weight indices
    * using prefixes.
    * Weight index is the prefixes of encoding, thus leave out the right most
    * bit in calc_index.
    * Binary classification path is the suffixes of encoding, thus leave out the
    * left most bit in calc_bit.
    */
-  size_t calc_index(int bit) const { return ptable_data_[bit]; }
-  bool calc_bit(int bit) const { return pcode_data_[bit]; }
+  size_t calc_index(int bit) const { return path_table_data_[bit]; }
+  bool calc_bit(int bit) const { return path_code_data_[bit]; }
 
   // NOTE: this function is not thread-safe.
   int get_length() const {
     if (length_ < 0) {
       auto len = seq_len_;
-      length_ =
-          static_cast<int>(std::find_if(ptable_data_, ptable_data_ + len,
-                                        [](const T& val) { return val < 0; }) -
-                           ptable_data_);
+      length_ = static_cast<int>(
+          std::find_if(path_table_data_, path_table_data_ + len,
+                       [](const T& val) { return val < 0; }) -
+          path_table_data_);
     }
     return length_;
   }
 
  private:
   int64_t seq_len_;
-  const T* ptable_data_;
-  const T* pcode_data_;
+  const T* path_table_data_;
+  const T* path_code_data_;
   mutable int length_{-1};
 };
 
@@ -214,7 +215,7 @@ class MatrixBitCodeFunctor {
                        const framework::Tensor& path_code, const int64_t* ids)
       : num_classes_(static_cast<size_t>(path_table.dims()[1])),
         ids_(ids),
-        code_table_(CustomCodeTable<int64_t>(ptable, pcode, ids)) {}
+        code_table_(CustomCodeTable<int64_t>(path_table, path_code, ids)) {}
 
   /* For j < code_length
        tmat(i, j) += vec(0, index(i, j))
   */
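
Editor's sketch (not part of the patch series): a minimal illustration of how
the renamed CustomCode members are meant to be consumed. The helper name, the
tensor arguments, and the driver logic below are assumptions made for
illustration, not code from any of the patches above.

// Hypothetical caller, assuming matrix_bit_code.h as patched above.
// path_table/path_code are int64 tensors of shape [batch, seq_len], padded
// with negative values past the end of each sample's path.
#include "paddle/fluid/operators/math/matrix_bit_code.h"

namespace math = paddle::operators::math;

void WalkOnePath(const paddle::framework::Tensor& path_table,
                 const paddle::framework::Tensor& path_code,
                 const int64_t* ids, int sample) {
  math::CustomCode<int64_t> code(path_table, path_code, ids, sample);
  for (int j = 0; j < code.get_length(); ++j) {
    size_t node = code.calc_index(j);  // weight row of the j-th inner node
    bool bit = code.calc_bit(j);       // binary label at that node
    (void)node;
    (void)bit;  // a real caller accumulates tmat(sample, j) from (node, bit)
  }
}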