From 202b2f1fa71b33b5165e166ecdde0163a9799bdb Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Mon, 10 Dec 2018 17:27:20 +0800
Subject: [PATCH 01/77] Move the beta pow scale calculation into Adam Op

---
 paddle/fluid/framework/ir/graph.cc          | 98 ++++++++++-----------
 paddle/fluid/operators/optimizers/adam_op.h | 17 ++++
 python/paddle/fluid/optimizer.py            | 43 ++++-----
 3 files changed, 88 insertions(+), 70 deletions(-)

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index fc91564bba..dfa310a386 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -28,55 +28,55 @@ namespace {

 void CheckProgram(const ProgramDesc &program) {
 #define _INT(role) static_cast<int>(role)

-  std::map<int, bool> visit;
-  for (OpDesc *op : program.Block(0).AllOps()) {
-    // For backward compatibility, some program doesn't have role added.
-    if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
-    int role_id =
-        boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
-    visit[role_id] = true;
-    switch (role_id) {
-      case _INT(OpRole::kForward):
-        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
-          LOG(ERROR)
-              << "Cannot add backward operator before forward operator %s."
-              << op->Type();
-        }
-        break;
-      case _INT(OpRole::kBackward):
-      case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
-        PADDLE_ENFORCE(
-            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
-            "Cannot add backward operator %s after optimize operator.",
-            op->Type());
-        break;
-      case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
-        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
-                                  _INT(OpRole::kLoss)) == visit.end(),
-                       "Cannot add backward|loss operator before "
-                       "forward|loss operator %s.",
-                       op->Type());
-        PADDLE_ENFORCE(
-            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
-            "Cannot add forward|loss operator %s after optimize operator.",
-            op->Type());
-        break;
-      case _INT(OpRole::kOptimize):
-      case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
-        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
-                       "Optimize operators %s must follow backward operator.",
-                       op->Type());
-        break;
-      case _INT(OpRole::kLRSched):
-      case _INT(OpRole::kDist):
-      case _INT(OpRole::kRPC):
-      case _INT(OpRole::kNotSpecified):
-        break;
-      default:
-        LOG(FATAL) << "Unknown operator role. Don't add new role because "
-                      "you don't know what you are doing.";
-    }
-  }
+//  std::map<int, bool> visit;
+//  for (OpDesc *op : program.Block(0).AllOps()) {
+//    // For backward compatibility, some program doesn't have role added.
+//    if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
+//    int role_id =
+//        boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
+//    visit[role_id] = true;
+//    switch (role_id) {
+//      case _INT(OpRole::kForward):
+//        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
+//          LOG(ERROR)
+//              << "Cannot add backward operator before forward operator %s."
+//              << op->Type();
+//        }
+//        break;
+//      case _INT(OpRole::kBackward):
+//      case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
+//        PADDLE_ENFORCE(
+//            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+//            "Cannot add backward operator %s after optimize operator.",
+//            op->Type());
+//        break;
+//      case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
+//        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
+//                                  _INT(OpRole::kLoss)) == visit.end(),
+//                       "Cannot add backward|loss operator before "
+//                       "forward|loss operator %s.",
+//                       op->Type());
+//        PADDLE_ENFORCE(
+//            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+//            "Cannot add forward|loss operator %s after optimize operator.",
+//            op->Type());
+//        break;
+//      case _INT(OpRole::kOptimize):
+//      case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
+//        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
+//                       "Optimize operators %s must follow backward operator.",
+//                       op->Type());
+//        break;
+//      case _INT(OpRole::kLRSched):
+//      case _INT(OpRole::kDist):
+//      case _INT(OpRole::kRPC):
+//      case _INT(OpRole::kNotSpecified):
+//        break;
+//      default:
+//        LOG(FATAL) << "Unknown operator role. Don't add new role because "
+//                      "you don't know what you are doing.";
+//    }
+//  }
 #undef _INT
 }
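Aside for readers of the role checks commented out above (a later patch in this series reinstates them): OpRole values are bit flags, so a case label such as _INT(OpRole::kBackward) | _INT(OpRole::kLoss) matches an operator tagged with both roles at once. A minimal standalone sketch of that pattern (the enum values below are illustrative, not Paddle's actual ones):

    #include <cstdio>

    // Hypothetical role bits; Paddle's OpRole uses the same flag idea.
    enum class OpRole : int { kForward = 1, kBackward = 2, kLoss = 4, kOptimize = 8 };

    int main() {
      int role_id =
          static_cast<int>(OpRole::kBackward) | static_cast<int>(OpRole::kLoss);
      switch (role_id) {
        case static_cast<int>(OpRole::kBackward) | static_cast<int>(OpRole::kLoss):
          std::puts("backward|loss op");  // combined roles hit combined labels
          break;
        default:
          std::puts("other role");
      }
      return 0;
    }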
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 3455d1ee54..2205f473f2 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -292,6 +292,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
             static_cast<const DeviceContext&>(ctx.device_context()), param.numel());
         for_range(functor);
+
+        auto& dev =
+            *ctx.template device_context<DeviceContext>().eigen_device();
+
+        const LoDTensor* beta1_pow_ptr = ctx.Input<LoDTensor>("Beta1Pow");
+        auto eigen_in_beta1_pow =
+            framework::EigenVector<T>::Flatten(*beta1_pow_ptr);
+        auto eigen_out_beta1_pow = framework::EigenVector<T>::Flatten(
+            *(const_cast<LoDTensor*>(beta1_pow_ptr)));
+        eigen_out_beta1_pow.device(dev) = beta1 * eigen_in_beta1_pow;
+
+        const LoDTensor* beta2_pow_ptr = ctx.Input<LoDTensor>("Beta2Pow");
+        auto eigen_in_beta2_pow =
+            framework::EigenVector<T>::Flatten(*beta2_pow_ptr);
+        auto eigen_out_beta2_pow = framework::EigenVector<T>::Flatten(
+            *(const_cast<LoDTensor*>(beta2_pow_ptr)));
+        eigen_out_beta2_pow.device(dev) = beta2 * eigen_in_beta2_pow;
       }
     } else if (grad_var->IsType<framework::SelectedRows>()) {
       auto& grad =
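Note on the kernel change above: after the Adam functor runs, the op itself now scales Beta1Pow and Beta2Pow in place, which is what the Python-side scale ops (removed in the optimizer.py hunk below) used to do. Each accumulator is multiplied by its beta once per step, so after t steps it holds its initial value times beta^t. A minimal standalone sketch of that invariant (plain C++, no Paddle types; all names here are illustrative):

    #include <cassert>
    #include <cmath>

    int main() {
      const double beta1 = 0.9, beta2 = 0.999;
      double beta1_pow = 1.0, beta2_pow = 1.0;  // accumulators, initially 1
      for (int t = 1; t <= 10; ++t) {
        // the in-place update the kernel now performs once per step
        beta1_pow *= beta1;
        beta2_pow *= beta2;
        assert(std::fabs(beta1_pow - std::pow(beta1, t)) < 1e-12);
        assert(std::fabs(beta2_pow - std::pow(beta2, t)) < 1e-12);
      }
      return 0;
    }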
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index da92826d41..1930ac106b 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -477,7 +477,7 @@ class LarsMomentumOptimizer(Optimizer):
         regularization: A Regularizer, such as
                         fluid.regularizer.L2DecayRegularizer.
         name: A optional name prefix.
-
+
     Examples:
         .. code-block:: python
@@ -739,26 +739,27 @@ class AdamOptimizer(Optimizer):
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        for param, grad in param_and_grads:
-            if grad is None:
-                continue
-            with param.block.program._optimized_guard(
-                    [param, grad]), name_scope("optimizer"):
-                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                                      param)
-                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
-                                                      param)
-                main_block.append_op(
-                    type="scale",
-                    inputs={"X": beta1_pow_acc},
-                    outputs={"Out": beta1_pow_acc},
-                    attrs={"scale": self._beta1})
-
-                main_block.append_op(
-                    type="scale",
-                    inputs={"X": beta2_pow_acc},
-                    outputs={"Out": beta2_pow_acc},
-                    attrs={"scale": self._beta2})
+        # for param, grad in param_and_grads:
+
+        #     if grad is None:
+        #         continue
+        #     with param.block.program._optimized_guard(
+        #             [param, grad]), name_scope("optimizer"):
+        #         beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+        #                                               param)
+        #         beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+        #                                               param)
+        #         main_block.append_op(
+        #             type="scale",
+        #             inputs={"X": beta1_pow_acc},
+        #             outputs={"Out": beta1_pow_acc},
+        #             attrs={"scale": self._beta1})
+
+        #         main_block.append_op(
+        #             type="scale",
+        #             inputs={"X": beta2_pow_acc},
+        #             outputs={"Out": beta2_pow_acc},
+        #             attrs={"scale": self._beta2})


 class AdamaxOptimizer(Optimizer):

From 570338699b2038b802e9d49ea80efc916416477a Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Tue, 11 Dec 2018 18:29:16 +0800
Subject: [PATCH 02/77] Add debug info

---
 .../details/computation_op_handle.cc          |  45 ++++-
 .../fast_threaded_ssa_graph_executor.cc       |   1 +
 .../fluid/framework/details/op_handle_base.cc |   2 +-
 paddle/fluid/framework/operator.cc            | 160 +++++++++++-------
 paddle/fluid/framework/scope.cc               |  37 ++--
 .../operators/elementwise/elementwise_op.h    |  69 ++++----
 paddle/fluid/operators/optimizers/adam_op.cc  |  79 ++++-----
 python/paddle/fluid/profiler.py               |   3 +-
 8 files changed, 239 insertions(+), 157 deletions(-)

diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index 7ad1e40c60..9003033438 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -26,17 +26,46 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
       scope_(scope),
       place_(place) {}

+struct RecordTime {
+  RecordTime(const std::string &name, const std::string &type)
+      : name_(name), type_(type), start_(std::chrono::system_clock::now()) {}
+
+  ~RecordTime() {
+    if (type_ == "elementsize_add") {
+      end_ = std::chrono::system_clock::now();
+      std::chrono::duration<double> diff = end_ - start_;
+      VLOG(1) << name_ << " " << type_ << " time record: " << diff.count();
+    }
+  }
+
+  std::string name_;
+  std::string type_;
+  std::chrono::system_clock::time_point start_;
+  std::chrono::system_clock::time_point end_;
+};
+
 void ComputationOpHandle::RunImpl() {
-  WaitInputVarGenerated(place_);
+  {
+    RecordTime rt("ComputationOpHandle::RunImpl", "Wait");
+    WaitInputVarGenerated(place_);
+  }
+
+  Scope *scope = nullptr;
+  {
+    RecordTime rt("ComputationOpHandle::RunImpl", "PrepareScope");
+    scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  }
+
+  {
+    RecordTime rt("ComputationOpHandle::RunImpl", "ReallyRun " + op_->Type());

-  auto run_func = [this]() {
-    op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
-  };
+    auto run_func = [this, scope]() { op_->Run(*scope, place_); };

-  if (is_lock_and_record_event_free_) {
-    run_func();
-  } else {
-    this->RunAndRecordEvent(run_func);
+    if (is_lock_and_record_event_free_) {
+      run_func();
+    } else {
+      this->RunAndRecordEvent(run_func);
+    }
   }
 }

diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 949510e037..872bc5d654 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
     ClearFetchOp(graph_.get(), &fetch_ops);
   return fetches;
 }
+
 void FastThreadedSSAGraphExecutor::RunOpAsync(
     std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
     OpHandleBase *op,

diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 4822627ac3..5997f12ffa 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -41,7 +41,7 @@ OpHandleBase::~OpHandleBase() {

 void OpHandleBase::Run(bool use_cuda) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_cuda) {
+  if (events_.empty() && use_cuda && !dev_ctxes_.empty()) {
     for (auto &p : dev_ctxes_) {
       int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
       PADDLE_ENFORCE(cudaSetDevice(dev_id));
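Aside on the RecordTime helper added above: it is an RAII timer, so the measurement stops and is logged when the guard goes out of scope, which is why each instrumented region needs only a single declaration. A minimal standalone version of the same idiom (std::cerr instead of VLOG; this is an illustrative sketch, not Paddle's RecordTime):

    #include <chrono>
    #include <iostream>
    #include <string>
    #include <utility>

    struct ScopedTimer {
      explicit ScopedTimer(std::string label)
          : label_(std::move(label)), start_(std::chrono::steady_clock::now()) {}
      ~ScopedTimer() {
        std::chrono::duration<double> diff =
            std::chrono::steady_clock::now() - start_;
        std::cerr << label_ << " took " << diff.count() << "s\n";
      }
      std::string label_;
      std::chrono::steady_clock::time_point start_;
    };

    int main() {
      {
        ScopedTimer t("busy loop");  // logs automatically at the closing brace
        volatile long sum = 0;
        for (long i = 0; i < 10000000; ++i) sum += i;
      }
      return 0;
    }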
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c6f3254e9f..b8adce4edf 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -701,85 +701,125 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }

+struct RecordTime {
+  RecordTime(const std::string& name, const std::string& type)
+      : name_(name), type_(type), start_(std::chrono::system_clock::now()) {}
+
+  void inline stop() {
+    end_ = std::chrono::system_clock::now();
+    std::chrono::duration<double> diff = end_ - start_;
+    VLOG(1) << name_ << " " << type_ << " time record: " << diff.count();
+  }
+
+  ~RecordTime() {
+    if (type_ == "elementwise_add") {
+      stop();
+    }
+    // stop();
+  }
+
+  std::string name_;
+  std::string type_;
+  std::chrono::system_clock::time_point start_;
+  std::chrono::system_clock::time_point end_;
+};
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-  this->InferShape(&infer_shape_ctx);
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = pool.Get(place);
-
-  // check if op[type] has kernel registered.
-  auto& all_op_kernels = AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(type_);
-  if (kernels_iter == all_op_kernels.end()) {
-    PADDLE_THROW(
-        "There are no kernels which are registered in the %s operator.", type_);
+  RecordTime rt("OperatorWithKernel::All", type_);
+  {
+    RecordTime rt("OperatorWithKernel::InferShape", type_);
+    RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+    this->InferShape(&infer_shape_ctx);
   }

-  OpKernelMap& kernels = kernels_iter->second;
+  {
+    RecordTime* rt_1 = new RecordTime("OperatorWithKernel::Compute1", type_);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.Get(place);

-  // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
-  // transform functions are ready.
+    // check if op[type] has kernel registered.
+    auto& all_op_kernels = AllOpKernels();
+    auto kernels_iter = all_op_kernels.find(type_);
+    if (kernels_iter == all_op_kernels.end()) {
+      PADDLE_THROW(
+          "There are no kernels which are registered in the %s operator.",
+          type_);
+    }

-  // for (auto& candidate : kKernelPriority) {
-  // Do selection
-  // }
+    OpKernelMap& kernels = kernels_iter->second;

-  auto expected_kernel_key =
-      this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+    // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
+    // transform functions are ready.

-  auto kernel_iter = kernels.find(expected_kernel_key);
+    // for (auto& candidate : kKernelPriority) {
+    // Do selection
+    // }
+
+    auto expected_kernel_key =
+        this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
+    VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+    auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
-  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
-  if (kernel_iter == kernels.end() &&
-      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
-    expected_kernel_key.library_type_ = LibraryType::kPlain;
-    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
+    // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
+    if (kernel_iter == kernels.end() &&
+        expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
+      VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+      expected_kernel_key.library_type_ = LibraryType::kPlain;
+      expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
 #endif
-  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("op %s does not have kernel for %s", type_,
-                 KernelTypeToString(expected_kernel_key));
-  }
+    if (kernel_iter == kernels.end()) {
+      PADDLE_THROW("op %s does not have kernel for %s", type_,
+                   KernelTypeToString(expected_kernel_key));
+    }

-  // do data transformScope &transfer_scope;
-  std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
+    // do data transformScope &transfer_scope;
+    std::vector<std::string> transfered_inplace_vars;
+    Scope* transfer_scope = nullptr;
+    // auto* transfer_scope =
+    //     TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);

-  // exec scope is the scope that kernel actually executed on.
-  const Scope& exec_scope =
-      (transfer_scope == nullptr ? scope : *transfer_scope);
+    // exec scope is the scope that kernel actually executed on.
+    const Scope& exec_scope = scope;
+    // const Scope& exec_scope =
+    //     (transfer_scope == nullptr ? scope : *transfer_scope);

-  if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
-    dev_ctx = pool.Get(expected_kernel_key.place_);
-  }
+    if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
+      dev_ctx = pool.Get(expected_kernel_key.place_);
+    }
+    delete rt_1;

-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
+    RecordTime* rt_2 = new RecordTime("OperatorWithKernel::Compute2", type_);
+    kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
+    delete rt_2;

-  if (!transfered_inplace_vars.empty()) {
-    // there is inplace variable has been transfered.
-    TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
-  }
+    RecordTime* rt_3 = new RecordTime("OperatorWithKernel::Compute3", type_);
+    if (!transfered_inplace_vars.empty()) {
+      // there is inplace variable has been transfered.
+      TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
+    }

-  /*For profiling/benchmark only*/
-  if (FLAGS_benchmark) {
-    dev_ctx->Wait();
-  }
+    /*For profiling/benchmark only*/
+    if (FLAGS_benchmark) {
+      dev_ctx->Wait();
+    }

-  if (FLAGS_check_nan_inf) {
-    for (auto& vname : OutputVars(true)) {
-      auto* var = exec_scope.FindVar(vname);
-      if (var == nullptr) continue;
-      if (var->IsType<framework::LoDTensor>()) {
-        CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
-      } else if (var->IsType<framework::SelectedRows>()) {
-        CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
+    if (FLAGS_check_nan_inf) {
+      for (auto& vname : OutputVars(true)) {
+        auto* var = exec_scope.FindVar(vname);
+        if (var == nullptr) continue;
+        if (var->IsType<framework::LoDTensor>()) {
+          CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+        } else if (var->IsType<framework::SelectedRows>()) {
+          CheckTensorNANOrInf(vname,
+                              var->Get<framework::SelectedRows>().value());
+        }
       }
     }
+    delete rt_3;
   }
 }

 void OperatorWithKernel::TransferInplaceVarsBack(

diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 0d261dd7cc..61416676d6 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -43,9 +43,16 @@ DEFINE_double(
 // the mutex will cause serious performance issue.
 // So the mutex is disabled when `ON_INFER`.
 #ifdef PADDLE_ON_INFERENCE
-#define SCOPE_LOCK_GUARD
+#define SCOPE_READER_LOCK
+#define SCOPE_WRITER_LOCK
 #else
-#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
+// TODO(minqiyang): use reader lock and writer lock in all platforms
+#define SCOPE_READER_LOCK
+#define SCOPE_WRITER_LOCK
+// #define SCOPE_READER_LOCK boost::shared_lock<boost::shared_mutex>
+// lock(mutex_);
+// #define SCOPE_WRITER_LOCK boost::unique_lock<boost::shared_mutex>
+// lock(mutex_);
 #endif

 namespace paddle {
@@ -61,18 +68,18 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }

 Scope& Scope::NewScope() const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   kids_.push_back(new Scope(this));
   return *kids_.back();
 }

 Variable* Scope::Var(const std::string& name) {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   return VarInternal(name);
 }

 Variable* Scope::Var(std::string* name) {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
     *name = new_name;
@@ -81,34 +88,34 @@ Variable* Scope::Var(std::string* name) {
 }

 Variable* Scope::FindVar(const std::string& name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   return FindVarInternal(name);
 }

 Variable* Scope::FindLocalVar(const std::string& name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   return FindVarLocally(name);
 }

 const Scope* Scope::FindScope(const Variable* var) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   return FindScopeInternal(var);
 }

 void Scope::DropKids() {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }

 bool Scope::HasKid(const Scope* scope) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   return it != this->kids_.end();
 }

 std::vector<std::string> Scope::LocalVarNames() const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {
@@ -118,7 +125,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }

 void Scope::DeleteScope(Scope* scope) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
                  this, scope);
@@ -132,7 +139,7 @@ void Scope::DeleteScope(Scope* scope) const {
 }

 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
@@ -145,12 +152,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {

 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   RenameInternal(origin_name, new_name);
 }

 std::string Scope::Rename(const std::string& origin_name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   RenameInternal(origin_name, new_name);
   return new_name;

diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 87bf7c6b15..181baac870 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -33,34 +33,37 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   using Tensor = framework::Tensor;

   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of elementwise op should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of elementwise op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of elementwise op should not be null.");
-
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Y").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s [%s]",
-        ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front());
-
-    if (ctx->GetInputsVarType("X").front() ==
-        framework::proto::VarType::LOD_TENSOR) {
-      auto x_dim = ctx->GetInputDim("X");
-      auto y_dim = ctx->GetInputDim("Y");
-      PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                        "Rank of first input must >= rank of second input.");
-    } else if (ctx->GetInputsVarType("X").front() ==
-               framework::proto::VarType::SELECTED_ROWS) {
-      PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
-                         (ctx->GetInputDim("Y")[0] == 1),
-                     "For elementwise_op, if X is Sparse, "
-                     "Y must be scalar.");
-    } else {
-      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
-                   ctx->GetInputsVarType("X").front());
+    if (!ctx->IsRuntime()) {
+      PADDLE_ENFORCE(ctx->HasInput("X"),
+                     "Input(X) of elementwise op should not be null.");
+      PADDLE_ENFORCE(ctx->HasInput("Y"),
+                     "Input(Y) of elementwise op should not be null.");
+      PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                     "Output(Out) of elementwise op should not be null.");
+
+      PADDLE_ENFORCE(ctx->GetInputsVarType("Y").front() ==
+                         framework::proto::VarType::LOD_TENSOR,
+                     "The input var's type should be LoDTensor, but the "
+                     "received is %s [%s]",
+                     ctx->GetInputsVarType("Y").front(),
+                     ctx->Inputs("Y").front());
+
+      if (ctx->GetInputsVarType("X").front() ==
+          framework::proto::VarType::LOD_TENSOR) {
+        auto x_dim = ctx->GetInputDim("X");
+        auto y_dim = ctx->GetInputDim("Y");
+        PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                          "Rank of first input must >= rank of second input.");
+      } else if (ctx->GetInputsVarType("X").front() ==
+                 framework::proto::VarType::SELECTED_ROWS) {
+        PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
+                           (ctx->GetInputDim("Y")[0] == 1),
+                       "For elementwise_op, if X is Sparse, "
+                       "Y must be scalar.");
+      } else {
+        PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
+                     ctx->GetInputsVarType("X").front());
+      }
     }

     ctx->ShareDim("X", /*->*/ "Out");
@@ -125,7 +128,7 @@ The equation is:

 $$%s$$

-- $X$: a tensor of any dimension.
+- $X$: a tensor of any dimension.
 - $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$.

 There are two cases for this operator:
@@ -135,10 +138,10 @@ There are two cases for this operator:

 For case 2:

-1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index
-   for broadcasting $Y$ onto $X$.
+1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index
+   for broadcasting $Y$ onto $X$.
 2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
-3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of
+3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of
    subsequence, such as shape(Y) = (2, 1) => (2).

 For example:
@@ -152,7 +155,7 @@ For example:
     shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
     shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0

-The inputs $X$ and $Y$ can carry the different LoD information.
+The inputs $X$ and $Y$ can carry the different LoD information.
 But the output only shares the LoD information with the input $X$.

 )DOC",

diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 5710cda39a..bc1b20321f 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -23,56 +23,57 @@ class AdamOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment1"),
-                   "Input(Moment1) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment2"),
-                   "Input(Moment2) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
-                   "Input(Beta1Pow) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
-                   "Input(Beta2Pow) of AdamOp should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
-                   "Output(Moment1Out) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
-                   "Output(Moment2Out) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Param"),
+    //                "Input(Param) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Grad"),
+    //                "Input(Grad) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Moment1"),
+    //                "Input(Moment1) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Moment2"),
+    //                "Input(Moment2) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+    //                "Input(LearningRate) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+    //                "Input(Beta1Pow) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
+    //                "Input(Beta2Pow) of AdamOp should not be null.");
+
+    // PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+    //                "Output(ParamOut) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
+    //                "Output(Moment1Out) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
+    //                "Output(Moment2Out) of AdamOp should not be null.");

     auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 dimension");
+    // PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+    //                   "Learning rate should have 1 dimension");
     auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                      "Beta1 power accumulator should have 1 dimension");
+    // PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
+    //                   "Beta1 power accumulator should have 1 dimension");
     auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
-                      "Beta2 power accumulator should have 1 dimension");
+    // PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+    //                   "Beta2 power accumulator should have 1 dimension");

     auto param_dims = ctx->GetInputDim("Param");
-    if (ctx->GetInputsVarType("Grad")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(
-          param_dims, ctx->GetInputDim("Grad"),
-          "Param and Grad input of AdamOp should have same dimension");
-    }
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment1"),
-        "Param and Moment1 input of AdamOp should have same dimension");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment2"),
-        "Param and Moment2 input of AdamOp should have same dimension");
+    // if (ctx->GetInputsVarType("Grad")[0] ==
+    //     framework::proto::VarType::LOD_TENSOR) {
+    //   PADDLE_ENFORCE_EQ(
+    //       param_dims, ctx->GetInputDim("Grad"),
+    //       "Param and Grad input of AdamOp should have same dimension");
+    // }
+    // PADDLE_ENFORCE_EQ(
+    //     param_dims, ctx->GetInputDim("Moment1"),
+    //     "Param and Moment1 input of AdamOp should have same dimension");
+    // PADDLE_ENFORCE_EQ(
+    //     param_dims, ctx->GetInputDim("Moment2"),
+    //     "Param and Moment2 input of AdamOp should have same dimension");

     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("Moment1Out", param_dims);
     ctx->SetOutputDim("Moment2Out", param_dims);
   }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =

diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index e05885f5f5..8df2e01b03 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -92,7 +92,8 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     config_file = 'nvprof_config_file'
     with open(config_file, 'wb') as fp:
         fp.writelines([six.b("%s\n" % item) for item in config])
-    core.nvprof_init(output_file, output_mode, config_file)
+    #Comment this for nvprof
+    #core.nvprof_init(output_file, output_mode, config_file)
     # Enables profiler collection by the active CUDA profiling tool.
     core.nvprof_start()
     yield

From 7a43e5170325f3a78e026bb4d7039e0c25be8686 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Wed, 12 Dec 2018 16:16:26 +0800
Subject: [PATCH 03/77] Add gperf tools

---
 CMakeLists.txt                              |  6 ++++
 cmake/generic.cmake                         | 16 +++++++++++
 paddle/fluid/framework/parallel_executor.cc | 31 ++++++++++++++++++++-
 python/paddle/fluid/__init__.py             |  3 +-
 4 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index efa68c9ba2..3e59aca2d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,6 +81,12 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
 option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)

+if (WITH_PROFILER)
+  find_package(Gperftools REQUIRED)
+  include_directories(${GPERFTOOLS_INCLUDE_DIR})
+  add_definitions(-DWITH_GPERFTOOLS)
+endif()
+
 # PY_VERSION
 if(NOT PY_VERSION)
   set(PY_VERSION 2.7)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 312fbaa0b3..a8b9dcfcf5 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME)
   endif()
 endfunction(find_fluid_modules)

+
+function(common_link TARGET_NAME)
+  if (WITH_PROFILER)
+    target_link_libraries(${TARGET_NAME} gperftools::profiler)
+  endif()
+endfunction()
+
+
 # find all third_party modules is used for paddle static library
 # for reduce the dependency when building the inference libs.
 set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
@@ -274,6 +282,7 @@ function(cc_library TARGET_NAME)
     endif()
     target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+    common_link(${TARGET_NAME})
   endif()

   # cpplint code style
@@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME)
   if(cc_binary_DEPS)
     target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
     add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
+    common_link(${TARGET_NAME})
   endif()
 endfunction(cc_binary)
@@ -362,6 +372,7 @@ function(cc_test TARGET_NAME)
       target_link_libraries(${TARGET_NAME} ${win32_deps})
     endif(WIN32)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME)
     if(nv_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
       add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
+      common_link(${TARGET_NAME})
     endif()
   endif()
 endfunction(nv_binary)
@@ -433,6 +445,7 @@ function(nv_test TARGET_NAME)
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
       set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
@@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME)
     if(hip_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
       add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+      common_link(${TARGET_NAME})
     endif()
   endif()
 endfunction(hip_binary)
@@ -518,6 +532,7 @@ function(hip_test TARGET_NAME)
     set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
     target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
     add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
   endif()
 endfunction(hip_test)
@@ -560,6 +575,7 @@ function(go_library TARGET_NAME)
   endif()
   if(go_library_DEPS)
     add_dependencies(${TARGET_NAME} ${go_library_DEPS})
+    common_link(${TARGET_NAME})
   endif(go_library_DEPS)

   # The "source file" of the library is `${dummyfile}` which never

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b98408ee77..28a4b14b27 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -30,13 +30,36 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"

+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+DEFINE_string(pe_profile_fname, "",
+              "Profiler filename for PE, which generated by gperftools."
+              "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable.");
+
 namespace paddle {
 namespace framework {

+static std::once_flag gProfileOnce;
+#ifdef WITH_GPERFTOOLS
+static bool gProfileStarted = false;
+#endif
 class ParallelExecutorPrivate {
  public:
   explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
-      : places_(places) {}
+      : places_(places) {
+    if (!FLAGS_pe_profile_fname.empty()) {
+      std::call_once(gProfileOnce, [] {
+#ifdef WITH_GPERFTOOLS
+        ProfilerStart(FLAGS_pe_profile_fname.c_str());
+        gProfileStarted = true;
+#else
+        LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                        "FLAGS_pe_profile_fname will be ignored";
+#endif
+      });
+    }
+  }

   ~ParallelExecutorPrivate() {
     if (own_local_scope_) {
@@ -270,6 +293,12 @@ void ParallelExecutor::BCastParamsToDevices(

 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                            const std::string &fetched_var_name) {
+#ifdef WITH_GPERFTOOLS
+  if (gProfileStarted) {
+    ProfilerFlush();
+  }
+#endif
+
   platform::RecordBlock b(0);
 #ifdef PADDLE_WITH_CUDA
   if (!gcs_.empty()) {

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 2a53519188..4cf0784d81 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -125,7 +125,8 @@ def __bootstrap__():
         'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
         'eager_delete_tensor_gb', 'allocator_strategy',
-        'reader_queue_speed_test_mode', 'print_sub_graph_dir'
+        'reader_queue_speed_test_mode', 'print_sub_graph_dir',
+        'pe_profile_fname'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
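Aside on patch 03: it wires gperftools' sampling CPU profiler into ParallelExecutor behind the pe_profile_fname flag. ProfilerStart runs exactly once (guarded by std::call_once), and every Run() flushes the accumulated samples so the profile stays usable even if the process is killed. A minimal standalone sketch of that start/flush/stop lifecycle (assumes gperftools is installed and the binary is linked with -lprofiler; the hot_loop function and output path are illustrative):

    #include <gperftools/profiler.h>

    // hypothetical hot function standing in for ParallelExecutor::Run
    static void hot_loop() {
      volatile double x = 0;
      for (int i = 0; i < 1000000; ++i) x += i * 0.5;
    }

    int main() {
      ProfilerStart("/tmp/pe.prof");  // like the once-only start in the patch
      for (int step = 0; step < 100; ++step) {
        hot_loop();
        ProfilerFlush();  // like the flush at the top of each Run()
      }
      ProfilerStop();
      return 0;
    }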
From b75bd29c3ae74b5d48d573916eebab6473b3c30f Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Wed, 12 Dec 2018 16:51:01 +0800
Subject: [PATCH 04/77] Remove debug info

---
 .../details/computation_op_handle.cc          |  45 +----
 .../fluid/framework/details/op_handle_base.cc |   2 +-
 paddle/fluid/framework/ir/graph.cc            | 132 +++++++++------
 paddle/fluid/framework/operator.cc            | 160 +++++++-----------
 .../operators/elementwise/elementwise_op.h    |  69 ++++----
 paddle/fluid/operators/optimizers/adam_op.cc  |  79 +++++----
 6 files changed, 224 insertions(+), 263 deletions(-)

diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index 9003033438..7ad1e40c60 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -26,46 +26,17 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
       scope_(scope),
       place_(place) {}

-struct RecordTime {
-  RecordTime(const std::string &name, const std::string &type)
-      : name_(name), type_(type), start_(std::chrono::system_clock::now()) {}
-
-  ~RecordTime() {
-    if (type_ == "elementsize_add") {
-      end_ = std::chrono::system_clock::now();
-      std::chrono::duration<double> diff = end_ - start_;
-      VLOG(1) << name_ << " " << type_ << " time record: " << diff.count();
-    }
-  }
-
-  std::string name_;
-  std::string type_;
-  std::chrono::system_clock::time_point start_;
-  std::chrono::system_clock::time_point end_;
-};
-
 void ComputationOpHandle::RunImpl() {
-  {
-    RecordTime rt("ComputationOpHandle::RunImpl", "Wait");
-    WaitInputVarGenerated(place_);
-  }
-
-  Scope *scope = nullptr;
-  {
-    RecordTime rt("ComputationOpHandle::RunImpl", "PrepareScope");
-    scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  }
-
-  {
-    RecordTime rt("ComputationOpHandle::RunImpl", "ReallyRun " + op_->Type());
+  WaitInputVarGenerated(place_);

-    auto run_func = [this, scope]() { op_->Run(*scope, place_); };
+  auto run_func = [this]() {
+    op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
+  };

-    if (is_lock_and_record_event_free_) {
-      run_func();
-    } else {
-      this->RunAndRecordEvent(run_func);
-    }
+  if (is_lock_and_record_event_free_) {
+    run_func();
+  } else {
+    this->RunAndRecordEvent(run_func);
   }
 }

diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 5997f12ffa..4822627ac3 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -41,7 +41,7 @@ OpHandleBase::~OpHandleBase() {

 void OpHandleBase::Run(bool use_cuda) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_cuda && !dev_ctxes_.empty()) {
+  if (events_.empty() && use_cuda) {
     for (auto &p : dev_ctxes_) {
       int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
       PADDLE_ENFORCE(cudaSetDevice(dev_id));

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index dfa310a386..9ebf136698 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -20,6 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/var_desc.h"

+DEFINE_bool(enforce_when_check_program, true,
+            "Checking whether the program is correct or not. We will log "
+            "errors rather than throwing exceptions if this flag turned off");
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -28,55 +32,85 @@ namespace {

 void CheckProgram(const ProgramDesc &program) {
 #define _INT(role) static_cast<int>(role)

-//  std::map<int, bool> visit;
-//  for (OpDesc *op : program.Block(0).AllOps()) {
-//    // For backward compatibility, some program doesn't have role added.
-//    if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
-//    int role_id =
-//        boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
-//    visit[role_id] = true;
-//    switch (role_id) {
-//      case _INT(OpRole::kForward):
-//        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
-//          LOG(ERROR)
-//              << "Cannot add backward operator before forward operator %s."
-//              << op->Type();
-//        }
-//        break;
-//      case _INT(OpRole::kBackward):
-//      case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
-//        PADDLE_ENFORCE(
-//            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
-//            "Cannot add backward operator %s after optimize operator.",
-//            op->Type());
-//        break;
-//      case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
-//        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
-//                                  _INT(OpRole::kLoss)) == visit.end(),
-//                       "Cannot add backward|loss operator before "
-//                       "forward|loss operator %s.",
-//                       op->Type());
-//        PADDLE_ENFORCE(
-//            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
-//            "Cannot add forward|loss operator %s after optimize operator.",
-//            op->Type());
-//        break;
-//      case _INT(OpRole::kOptimize):
-//      case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
-//        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
-//                       "Optimize operators %s must follow backward operator.",
-//                       op->Type());
-//        break;
-//      case _INT(OpRole::kLRSched):
-//      case _INT(OpRole::kDist):
-//      case _INT(OpRole::kRPC):
-//      case _INT(OpRole::kNotSpecified):
-//        break;
-//      default:
-//        LOG(FATAL) << "Unknown operator role. Don't add new role because "
-//                      "you don't know what you are doing.";
-//    }
-//  }
+  std::map<int, bool> visit;
+  for (OpDesc *op : program.Block(0).AllOps()) {
+    // For backward compatibility, some program doesn't have role added.
+    if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
+    int role_id =
+        boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
+    visit[role_id] = true;
+    switch (role_id) {
+      case _INT(OpRole::kForward):
+        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
+          LOG(ERROR)
+              << "Cannot add backward operator before forward operator %s."
+              << op->Type();
+        }
+        break;
+      case _INT(OpRole::kBackward):
+      case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
+        if (!FLAGS_enforce_when_check_program) {
+          PADDLE_ENFORCE(
+              visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+              "Cannot add backward operator %s after optimize operator.",
+              op->Type());
+        } else {
+          if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) {
+            LOG(ERROR)
+                << "Cannot add backward operator %s after optimize operator.",
+                << op->Type();
+          }
+        }
+        break;
+      case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
+        if (!FLAGS_enforce_when_check_program) {
+          PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
+                                    _INT(OpRole::kLoss)) == visit.end(),
+                         "Cannot add backward|loss operator before "
+                         "forward|loss operator %s.",
+                         op->Type());
+          PADDLE_ENFORCE(
+              visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+              "Cannot add forward|loss operator %s after optimize operator.",
+              op->Type());
+        } else {
+          if (visit.find(_INT(OpRole::kBackward) | _INT(OpRole::kLoss)) !=
+              visit.end()) {
+            LOG(ERROR) << "Cannot add backward|loss operator before "
+                       << "forward|loss operator %s." << op->Type();
+          }
+
+          if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) {
+            LOG(ERROR) << "Cannot add forward|loss operator %s after optimize "
+                          "operator.",
+                << op->Type();
+          }
+        }
+        break;
+      case _INT(OpRole::kOptimize):
+      case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
+        if (!FLAGS_enforce_when_check_program) {
+          PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
+                         "Optimize operators %s must follow backward operator.",
+                         op->Type());
+        } else {
+          if (visit.find(_INT(OpRole::kBackward)) == visit.end()) {
+            LOG(ERROR)
+                << "Optimize operators %s must follow backward operator.",
+                << op->Type();
+          }
+        }
+        break;
+      case _INT(OpRole::kLRSched):
+      case _INT(OpRole::kDist):
+      case _INT(OpRole::kRPC):
+      case _INT(OpRole::kNotSpecified):
+        break;
+      default:
+        LOG(FATAL) << "Unknown operator role. Don't add new role because "
+                      "you don't know what you are doing.";
+    }
+  }
 #undef _INT
 }
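Aside on FLAGS_enforce_when_check_program above: the flag chooses at runtime between failing hard and merely logging when a role-ordering rule is violated. A minimal standalone sketch of that flag-gated check pattern with gflags (the strict_checks flag name and the messages are illustrative, not Paddle's):

    #include <gflags/gflags.h>

    #include <cstdio>
    #include <cstdlib>

    DEFINE_bool(strict_checks, true,
                "If false, log rule violations instead of aborting.");

    // hypothetical validation hook standing in for CheckProgram's rules
    static void Check(bool ok, const char* msg) {
      if (ok) return;
      if (FLAGS_strict_checks) {
        std::fprintf(stderr, "FATAL: %s\n", msg);
        std::abort();  // hard failure, like PADDLE_ENFORCE
      }
      std::fprintf(stderr, "WARNING: %s\n", msg);  // log-only, like LOG(ERROR)
    }

    int main(int argc, char** argv) {
      gflags::ParseCommandLineFlags(&argc, &argv, true);
      Check(false, "optimize op seen before backward op");
      std::puts("continued despite the violation");
      return 0;
    }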
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b8adce4edf..c6f3254e9f 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -701,125 +701,85 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }

-struct RecordTime {
-  RecordTime(const std::string& name, const std::string& type)
-      : name_(name), type_(type), start_(std::chrono::system_clock::now()) {}
-
-  void inline stop() {
-    end_ = std::chrono::system_clock::now();
-    std::chrono::duration<double> diff = end_ - start_;
-    VLOG(1) << name_ << " " << type_ << " time record: " << diff.count();
-  }
-
-  ~RecordTime() {
-    if (type_ == "elementwise_add") {
-      stop();
-    }
-    // stop();
-  }
-
-  std::string name_;
-  std::string type_;
-  std::chrono::system_clock::time_point start_;
-  std::chrono::system_clock::time_point end_;
-};
-
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  RecordTime rt("OperatorWithKernel::All", type_);
-  {
-    RecordTime rt("OperatorWithKernel::InferShape", type_);
-    RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-    this->InferShape(&infer_shape_ctx);
-  }
+  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+  this->InferShape(&infer_shape_ctx);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(place);

-  {
-    RecordTime* rt_1 = new RecordTime("OperatorWithKernel::Compute1", type_);
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto* dev_ctx = pool.Get(place);
+  // check if op[type] has kernel registered.
+  auto& all_op_kernels = AllOpKernels();
+  auto kernels_iter = all_op_kernels.find(type_);
+  if (kernels_iter == all_op_kernels.end()) {
+    PADDLE_THROW(
+        "There are no kernels which are registered in the %s operator.", type_);
+  }

-    // check if op[type] has kernel registered.
-    auto& all_op_kernels = AllOpKernels();
-    auto kernels_iter = all_op_kernels.find(type_);
-    if (kernels_iter == all_op_kernels.end()) {
-      PADDLE_THROW(
-          "There are no kernels which are registered in the %s operator.",
-          type_);
-    }
+  OpKernelMap& kernels = kernels_iter->second;

-    OpKernelMap& kernels = kernels_iter->second;
+  // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
+  // transform functions are ready.

-    // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
-    // transform functions are ready.
+  // for (auto& candidate : kKernelPriority) {
+  // Do selection
+  // }

-    // for (auto& candidate : kKernelPriority) {
-    // Do selection
-    // }
+  auto expected_kernel_key =
+      this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;

-    auto expected_kernel_key =
-        this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-    VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
-
-    auto kernel_iter = kernels.find(expected_kernel_key);
+  auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
-    // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
-    if (kernel_iter == kernels.end() &&
-        expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-      VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
-      expected_kernel_key.library_type_ = LibraryType::kPlain;
-      expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
-      kernel_iter = kernels.find(expected_kernel_key);
-    }
+  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
+  if (kernel_iter == kernels.end() &&
+      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
+    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+    expected_kernel_key.library_type_ = LibraryType::kPlain;
+    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
 #endif
-    if (kernel_iter == kernels.end()) {
-      PADDLE_THROW("op %s does not have kernel for %s", type_,
-                   KernelTypeToString(expected_kernel_key));
-    }
+  if (kernel_iter == kernels.end()) {
+    PADDLE_THROW("op %s does not have kernel for %s", type_,
+                 KernelTypeToString(expected_kernel_key));
+  }

-    // do data transformScope &transfer_scope;
-    std::vector<std::string> transfered_inplace_vars;
-    Scope* transfer_scope = nullptr;
-    // auto* transfer_scope =
-    //     TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
+  // do data transformScope &transfer_scope;
+  std::vector<std::string> transfered_inplace_vars;
+  auto* transfer_scope =
+      TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);

-    // exec scope is the scope that kernel actually executed on.
-    const Scope& exec_scope = scope;
-    // const Scope& exec_scope =
-    //     (transfer_scope == nullptr ? scope : *transfer_scope);
+  // exec scope is the scope that kernel actually executed on.
+  const Scope& exec_scope =
+      (transfer_scope == nullptr ? scope : *transfer_scope);

-    if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
-      dev_ctx = pool.Get(expected_kernel_key.place_);
-    }
-    delete rt_1;
+  if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
+    dev_ctx = pool.Get(expected_kernel_key.place_);
+  }

-    RecordTime* rt_2 = new RecordTime("OperatorWithKernel::Compute2", type_);
-    kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
-    delete rt_2;
+  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));

-    RecordTime* rt_3 = new RecordTime("OperatorWithKernel::Compute3", type_);
-    if (!transfered_inplace_vars.empty()) {
-      // there is inplace variable has been transfered.
-      TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
-    }
+  if (!transfered_inplace_vars.empty()) {
+    // there is inplace variable has been transfered.
+    TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
+  }

-    /*For profiling/benchmark only*/
-    if (FLAGS_benchmark) {
-      dev_ctx->Wait();
-    }
+  /*For profiling/benchmark only*/
+  if (FLAGS_benchmark) {
+    dev_ctx->Wait();
+  }

-    if (FLAGS_check_nan_inf) {
-      for (auto& vname : OutputVars(true)) {
-        auto* var = exec_scope.FindVar(vname);
-        if (var == nullptr) continue;
-        if (var->IsType<framework::LoDTensor>()) {
-          CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
-        } else if (var->IsType<framework::SelectedRows>()) {
-          CheckTensorNANOrInf(vname,
-                              var->Get<framework::SelectedRows>().value());
-        }
+  if (FLAGS_check_nan_inf) {
+    for (auto& vname : OutputVars(true)) {
+      auto* var = exec_scope.FindVar(vname);
+      if (var == nullptr) continue;
+      if (var->IsType<framework::LoDTensor>()) {
+        CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+      } else if (var->IsType<framework::SelectedRows>()) {
+        CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
       }
     }
-    delete rt_3;
   }
 }

 void OperatorWithKernel::TransferInplaceVarsBack(

diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 181baac870..87bf7c6b15 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -33,37 +33,34 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   using Tensor = framework::Tensor;

   void InferShape(framework::InferShapeContext *ctx) const override {
-    if (!ctx->IsRuntime()) {
-      PADDLE_ENFORCE(ctx->HasInput("X"),
-                     "Input(X) of elementwise op should not be null.");
-      PADDLE_ENFORCE(ctx->HasInput("Y"),
-                     "Input(Y) of elementwise op should not be null.");
-      PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                     "Output(Out) of elementwise op should not be null.");
-
-      PADDLE_ENFORCE(ctx->GetInputsVarType("Y").front() ==
-                         framework::proto::VarType::LOD_TENSOR,
-                     "The input var's type should be LoDTensor, but the "
-                     "received is %s [%s]",
-                     ctx->GetInputsVarType("Y").front(),
-                     ctx->Inputs("Y").front());
-
-      if (ctx->GetInputsVarType("X").front() ==
-          framework::proto::VarType::LOD_TENSOR) {
-        auto x_dim = ctx->GetInputDim("X");
-        auto y_dim = ctx->GetInputDim("Y");
-        PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                          "Rank of first input must >= rank of second input.");
-      } else if (ctx->GetInputsVarType("X").front() ==
-                 framework::proto::VarType::SELECTED_ROWS) {
-        PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
-                           (ctx->GetInputDim("Y")[0] == 1),
-                       "For elementwise_op, if X is Sparse, "
-                       "Y must be scalar.");
-      } else {
-        PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
-                     ctx->GetInputsVarType("X").front());
-      }
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of elementwise op should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of elementwise op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of elementwise op should not be null.");
+
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Y").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s [%s]",
+        ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front());
+
+    if (ctx->GetInputsVarType("X").front() ==
+        framework::proto::VarType::LOD_TENSOR) {
+      auto x_dim = ctx->GetInputDim("X");
+      auto y_dim = ctx->GetInputDim("Y");
+      PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                        "Rank of first input must >= rank of second input.");
+    } else if (ctx->GetInputsVarType("X").front() ==
+               framework::proto::VarType::SELECTED_ROWS) {
+      PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
+                         (ctx->GetInputDim("Y")[0] == 1),
+                     "For elementwise_op, if X is Sparse, "
+                     "Y must be scalar.");
+    } else {
+      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
+                   ctx->GetInputsVarType("X").front());
     }

     ctx->ShareDim("X", /*->*/ "Out");
@@ -128,7 +125,7 @@ The equation is:

 $$%s$$

-- $X$: a tensor of any dimension.
+- $X$: a tensor of any dimension.
 - $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$.

 There are two cases for this operator:
@@ -138,10 +135,10 @@ There are two cases for this operator:

 For case 2:

-1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index
-   for broadcasting $Y$ onto $X$.
+1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index
+   for broadcasting $Y$ onto $X$.
 2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
-3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of
+3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of
    subsequence, such as shape(Y) = (2, 1) => (2).

 For example:
@@ -155,7 +152,7 @@ For example:
     shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
     shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0

-The inputs $X$ and $Y$ can carry the different LoD information.
+The inputs $X$ and $Y$ can carry the different LoD information.
 But the output only shares the LoD information with the input $X$.

 )DOC",

diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index bc1b20321f..5710cda39a 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -23,57 +23,56 @@ class AdamOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext *ctx) const override {
-    // PADDLE_ENFORCE(ctx->HasInput("Param"),
-    //                "Input(Param) of AdamOp should not be null.");
-    // PADDLE_ENFORCE(ctx->HasInput("Grad"),
-    //                "Input(Grad) of AdamOp should not be null.");
-    // PADDLE_ENFORCE(ctx->HasInput("Moment1"),
-    //                "Input(Moment1) of AdamOp should not be null.");
-    // PADDLE_ENFORCE(ctx->HasInput("Moment2"),
-    //                "Input(Moment2) of AdamOp should not be null.");
-    // PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-    //                "Input(LearningRate) of AdamOp should not be null.");
-    // PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
-    //                "Input(Beta1Pow) of AdamOp should not be null.");
-    // PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
-    //                "Input(Beta2Pow) of AdamOp should not be null.");
-
-    // PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-    //                "Output(ParamOut) of AdamOp should not be null.");
-    // PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
-    //                "Output(Moment1Out) of AdamOp should not be null.");
-    // PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
-    //                "Output(Moment2Out) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment1"),
+                   "Input(Moment1) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment2"),
+                   "Input(Moment2) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+                   "Input(Beta1Pow) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
+                   "Input(Beta2Pow) of AdamOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
+                   "Output(Moment1Out) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
+                   "Output(Moment2Out) of AdamOp should not be null.");

     auto lr_dims = ctx->GetInputDim("LearningRate");
-    // PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-    //                   "Learning rate should have 1 dimension");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 dimension");
     auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
-    // PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-    //                   "Beta1 power accumulator should have 1 dimension");
+    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
+                      "Beta1 power accumulator should have 1 dimension");
     auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-    // PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
-    //                   "Beta2 power accumulator should have 1 dimension");
+    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+                      "Beta2 power accumulator should have 1 dimension");

     auto param_dims = ctx->GetInputDim("Param");
-    // if (ctx->GetInputsVarType("Grad")[0] ==
-    //     framework::proto::VarType::LOD_TENSOR) {
-    //   PADDLE_ENFORCE_EQ(
-    //       param_dims, ctx->GetInputDim("Grad"),
-    //       "Param and Grad input of AdamOp should have same dimension");
-    // }
-    // PADDLE_ENFORCE_EQ(
-    //     param_dims, ctx->GetInputDim("Moment1"),
-    //     "Param and Moment1 input of AdamOp should have same dimension");
-    // PADDLE_ENFORCE_EQ(
-    //     param_dims, ctx->GetInputDim("Moment2"),
-    //     "Param and Moment2 input of AdamOp should have same dimension");
+    if (ctx->GetInputsVarType("Grad")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(
+          param_dims, ctx->GetInputDim("Grad"),
+          "Param and Grad input of AdamOp should have same dimension");
+    }
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment1"),
+        "Param and Moment1 input of AdamOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment2"),
+        "Param and Moment2 input of AdamOp should have same dimension");

     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("Moment1Out", param_dims);
     ctx->SetOutputDim("Moment2Out", param_dims);
   }
-
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =

From 1b61021cb36eae45e142a953c2c96cf46853aa7c Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Wed, 12 Dec 2018 17:02:24 +0800
Subject: [PATCH 05/77] Polish code

---
 paddle/fluid/framework/ir/graph.cc | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 9ebf136698..db74d5674a 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -57,7 +57,7 @@ void CheckProgram(const ProgramDesc &program) {
         } else {
           if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) {
             LOG(ERROR)
-                << "Cannot add backward operator %s after optimize operator.",
+                << "Cannot add backward operator %s after optimize operator."
                 << op->Type();
           }
         }
@@ -82,8 +82,8 @@ void CheckProgram(const ProgramDesc &program) {
           if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) {
             LOG(ERROR) << "Cannot add forward|loss operator %s after optimize "
-                          "operator.",
-                << op->Type();
+                          "operator."
+                       << op->Type();
           }
         }
         break;
@@ -95,9 +95,8 @@ void CheckProgram(const ProgramDesc &program) {
                          op->Type());
         } else {
           if (visit.find(_INT(OpRole::kBackward)) == visit.end()) {
-            LOG(ERROR)
-                << "Optimize operators %s must follow backward operator.",
-                << op->Type();
+            LOG(ERROR) << "Optimize operators %s must follow backward operator."
+                       << op->Type();
           }
         }
         break;

From a61eb543f5796d9899bff073e5f6647bc1003d71 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Wed, 12 Dec 2018 19:18:45 +0800
Subject: [PATCH 06/77] Add RWLock to Scope

---
 paddle/fluid/framework/rw_lock.h | 16 ++++++++++++----
 paddle/fluid/framework/scope.cc  | 11 ++++-------
 paddle/fluid/framework/scope.h   |  4 ++--
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h
index dbf00f3a79..dd918fcdfa 100644
--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@@ -16,7 +16,9 @@ limitations under the License. */

 #if !defined(_WIN32)
 #include <pthread.h>
-#endif  // !_WIN32
+#else
+#include <mutex>  // NOLINT
+#endif  // !_WIN32

 #include "paddle/fluid/platform/enforce.h"

@@ -51,9 +53,15 @@ struct RWLock {
 // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
 // In windows, rw_lock seems like a hack. Use empty object and do nothing.
 struct RWLock {
-  void RDLock() {}
-  void WRLock() {}
-  void UNLock() {}
+  // FIXME(minqiyang): use mutex here to do fake lock
+  void RDLock() { mutex_.lock(); }
+
+  void WRLock() { mutex_.lock(); }
+
+  void UNLock() { mutex_.unlock(); }
+
+ private:
+  std::mutex mutex_;
 };
 #endif

diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 61416676d6..190a057d9e 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -46,13 +46,10 @@ DEFINE_double(
 #define SCOPE_READER_LOCK
 #define SCOPE_WRITER_LOCK
 #else
-// TODO(minqiyang): use reader lock and writer lock in all platforms
-#define SCOPE_READER_LOCK
-#define SCOPE_WRITER_LOCK
-// #define SCOPE_READER_LOCK boost::shared_lock<boost::shared_mutex>
-// lock(mutex_);
-// #define SCOPE_WRITER_LOCK boost::unique_lock<boost::shared_mutex>
-// lock(mutex_);
+// TODO(minqiyang): use rwlock in all platforms, now rwlock is a fake one
+// in _WIN32 platform
+#define SCOPE_READER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kRDLock);
+#define SCOPE_WRITER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kWRLock);
 #endif

 namespace paddle {

diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 1901ffbe57..c140212c3e 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -15,11 +15,11 @@ limitations under the License. */
 #pragma once

 #include <list>
-#include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <vector>

+#include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/macros.h"

@@ -123,7 +123,7 @@ class Scope {
   DISABLE_COPY_AND_ASSIGN(Scope);

  private:
-  mutable std::mutex mutex_;
+  mutable RWLock rw_lock_;
 };

 // Generate some debug string about the inherience structure of scope, quite

From ad6ae0b071041c1f69c66c7c173733bfe7cb2752 Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Thu, 13 Dec 2018 18:39:46 +0800
Subject: [PATCH 07/77] 1. Add SpinLock 2. Seperate the lock of kids and vars
 in Scope

test=develop
---
 CMakeLists.txt                                |  1 +
 cmake/external/robin_map.cmake                | 31 +++++++
 .../framework/details/execution_strategy.h    |  2 +-
 .../scope_buffered_ssa_graph_executor.cc      |  9 +-
 paddle/fluid/framework/operator.cc            |  6 +-
 paddle/fluid/framework/rw_lock.h              | 91 +++++--------------
 paddle/fluid/framework/scope.cc               | 58 ++++++------
 paddle/fluid/framework/scope.h                | 15 ++-
 paddle/fluid/framework/spin_lock.h            | 71 +++++++++++++++
 paddle/fluid/operators/optimizers/adam_op.h   | 17 ----
 paddle/fluid/pybind/pybind.cc                 |  2 +-
 python/paddle/fluid/optimizer.py              | 43 +++++----
 12 files changed, 201 insertions(+), 145 deletions(-)
 create mode 100644 cmake/external/robin_map.cmake
 create mode 100644 paddle/fluid/framework/spin_lock.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e59aca2d9..2abbcef41a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -215,6 +215,7 @@ include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
+include(external/robin_map)    # download tsl::robin_map

 if (NOT WIN32)
 # there is no official support of warpctc, nccl, cupti in windows

diff --git a/cmake/external/robin_map.cmake b/cmake/external/robin_map.cmake
new file mode 100644
index 0000000000..ddaf59536c
--- /dev/null
+++ b/cmake/external/robin_map.cmake
@@ -0,0 +1,31 @@
+include(ExternalProject)
+
+set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map)
+set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include)
+
+include_directories(${ROBIN_MAP_INCLUDE_DIR})
+
+ExternalProject_Add(
+  extern_robin_map
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY "https://github.com/Tessil/robin-map.git"
+  GIT_TAG "v0.5.0"
+  PREFIX ${ROBIN_MAP_SOURCE_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  TEST_COMMAND ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c)
+  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+  add_library(robin_map STATIC ${dummyfile})
+else()
+  add_library(robin_map INTERFACE)
+endif()
+
+add_dependencies(robin_map extern_robin_map)
+
+LIST(APPEND externl_project_dependencies robin_map)

diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index 15c496130c..37b07e5736 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
   size_t num_threads_{0};
   bool use_cuda_{true};
   bool allow_op_delay_{false};
-  size_t num_iteration_per_drop_scope_{100};
+  size_t num_iteration_per_drop_scope_{1};
   ExecutorType type_{kDefault};
   bool dry_run_{false};
 };

diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 499246a985..9ded0266a9 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -76,9 +76,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
           : nullptr;
 #endif

-  if (!fetch_tensors.empty() ||
-      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
-    drop_scope_counter_ = 0;
+  if (!fetch_tensors.empty()) {
     // Wait All computational streams
     for (auto p : places_) {
       platform::DeviceContextPool::Instance().Get(p)->Wait();
@@ -91,12 +89,17 @@ FeedFetchList
ScopeBufferedSSAGraphExecutor::Run( } #endif } + } + + if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + drop_scope_counter_ = 0; for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } } + if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f..58e5926f54 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -163,11 +163,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { } bool OperatorBase::HasInputs(const std::string& name) const { - if (inputs_.find(name) != inputs_.end()) { - return true; - } else { - return false; - } + return inputs_.find(name) != inputs_.end(); } std::string OperatorBase::Input(const std::string& name) const { diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index dd918fcdfa..75e6bef9bf 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -31,17 +31,17 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } - void RDLock() { + inline void RDLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); } - void WRLock() { + inline void WRLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); } - void UNLock() { + inline void UNLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); } @@ -54,86 +54,43 @@ struct RWLock { // In windows, rw_lock seems like a hack. Use empty object and do nothing. struct RWLock { // FIXME(minqiyang): use mutex here to do fake lock - void RDLock() { mutex_.lock(); } + inline void RDLock() { mutex_.lock(); } - void WRLock() { mutex_.lock(); } + inline void WRLock() { mutex_.lock(); } - void UNLock() { mutex_.unlock(); } + inline void UNLock() { mutex_.unlock(); } private: std::mutex mutex_; }; #endif -class RWLockGuard { +class AutoWRLock { public: - enum Status { kUnLock, kWRLock, kRDLock }; - - RWLockGuard(RWLock* rw_lock, Status init_status) - : lock_(rw_lock), status_(Status::kUnLock) { - switch (init_status) { - case Status::kRDLock: { - RDLock(); - break; - } - case Status::kWRLock: { - WRLock(); - break; - } - case Status::kUnLock: { - break; - } - } - } + explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - void WRLock() { - switch (status_) { - case Status::kUnLock: { - lock_->WRLock(); - status_ = Status::kWRLock; - break; - } - case Status::kWRLock: { - break; - } - case Status::kRDLock: { - PADDLE_THROW( - "Please unlock read lock first before invoking write lock."); - break; - } - } - } + inline void Lock() { lock_->WRLock(); } - void RDLock() { - switch (status_) { - case Status::kUnLock: { - lock_->RDLock(); - status_ = Status::kRDLock; - break; - } - case Status::kRDLock: { - break; - } - case Status::kWRLock: { - PADDLE_THROW( - "Please unlock write lock first before invoking read lock."); - break; - } - } - } + inline void UnLock() { lock_->UNLock(); } - void UnLock() { - if (status_ != Status::kUnLock) { - lock_->UNLock(); - status_ = Status::kUnLock; - } - } + ~AutoWRLock() { UnLock(); } + + private: + RWLock* lock_; +}; + +class AutoRDLock { + public: + explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + + inline void Lock() { lock_->RDLock(); } + + inline void UnLock() { lock_->UNLock(); } - ~RWLockGuard() { UnLock(); } + ~AutoRDLock() { UnLock(); } 
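// Usage sketch for the RAII guards above (editorial, not part of the patch):
// a guard must be a named local so the lock is held for the whole critical
// section, e.g.
//
//   Variable* Scope::FindVar(const std::string& name) const {
//     AutoRDLock guard(&vars_lock_);  // read lock released on return
//     return FindVarInternal(name);
//   }
//
// An unnamed temporary such as AutoRDLock(&vars_lock_); would lock and then
// immediately unlock again, which is exactly what the macro fix in patch 08
// below addresses by expanding to a named auto_lock variable.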
private: RWLock* lock_; - Status status_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 190a057d9e..f05208c5ec 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include #include "glog/logging.h" -#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(benchmark, false, @@ -43,13 +42,15 @@ DEFINE_double( // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. #ifdef PADDLE_ON_INFERENCE -#define SCOPE_READER_LOCK -#define SCOPE_WRITER_LOCK +#define SCOPE_KIDS_READER_LOCK +#define SCOPE_KIDS_WRITER_LOCK +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK #else -// TODO(minqiyang): use rwlock in all platforms, now rwlock is a fake one -// in _WIN32 platform -#define SCOPE_READER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kRDLock); -#define SCOPE_WRITER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kWRLock); +#define SCOPE_KIDS_READER_LOCK AutoRDLock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock(&vars_lock_); #endif namespace paddle { @@ -65,64 +66,69 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - SCOPE_WRITER_LOCK - kids_.push_back(new Scope(this)); - return *kids_.back(); + Scope* child = new Scope(this); + { + SCOPE_KIDS_WRITER_LOCK + kids_.push_back(child); + } + return *child; } Variable* Scope::Var(const std::string& name) { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK return VarInternal(name); } Variable* Scope::Var(std::string* name) { - SCOPE_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; } + SCOPE_VARS_WRITER_LOCK return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_WRITER_LOCK + SCOPE_KIDS_WRITER_LOCK for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_READER_LOCK + SCOPE_KIDS_READER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - SCOPE_READER_LOCK std::vector known_vars; - known_vars.reserve(this->vars_.size()); - for (auto& p : vars_) { - known_vars.emplace_back(p.first); + { + SCOPE_VARS_READER_LOCK + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } } return known_vars; } void Scope::DeleteScope(Scope* scope) const { - SCOPE_WRITER_LOCK + SCOPE_KIDS_WRITER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -136,8 +142,8 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - SCOPE_WRITER_LOCK std::set var_set(var_names.begin(), var_names.end()); + SCOPE_VARS_WRITER_LOCK for (auto it = 
vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { it = vars_.erase(it); @@ -149,12 +155,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; @@ -188,7 +194,7 @@ void Scope::RenameInternal(const std::string& origin_name, auto new_it = vars_.find(new_name); PADDLE_ENFORCE(new_it == vars_.end(), "The variable with name %s is already in the scope", new_name); - vars_[new_name].reset(origin_it->second.release()); + vars_[new_name].reset(origin_it.value().release()); vars_.erase(origin_it); } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index c140212c3e..78ad8be500 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,11 +14,15 @@ limitations under the License. */ #pragma once +#include #include +#include #include -#include +#include #include +#include // NOLINT + #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -94,7 +98,11 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map> vars_; + mutable tsl::robin_map< + std::string, std::unique_ptr, std::hash, + std::equal_to, + std::allocator>>, true> + vars_; private: // Call Scope::NewScope for a sub-scope. @@ -123,7 +131,8 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable RWLock rw_lock_; + mutable RWLock kids_lock_; + mutable RWLock vars_lock_; }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/spin_lock.h b/paddle/fluid/framework/spin_lock.h new file mode 100644 index 0000000000..11a763d655 --- /dev/null +++ b/paddle/fluid/framework/spin_lock.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#if !defined(_WIN32)
+#include <pthread.h>
+#else
+#include <mutex>  // NOLINT
+#endif  // !_WIN32
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+#if !defined(_WIN32)
+struct SpinLock {
+  SpinLock() { pthread_spin_init(&lock_, PTHREAD_PROCESS_PRIVATE); }
+
+  ~SpinLock() { pthread_spin_destroy(&lock_); }
+
+  void Lock() {
+    PADDLE_ENFORCE_EQ(pthread_spin_lock(&lock_), 0, "acquire spin lock failed");
+  }
+
+  void Unlock() {
+    PADDLE_ENFORCE_EQ(pthread_spin_unlock(&lock_), 0,
+                      "release spin lock failed");
+  }
+
+ private:
+  pthread_spinlock_t lock_;
+};
+#else
+// FIXME(minqiyang): use mutex here to do fake spin lock
+struct SpinLock {
+  void Lock() { mutex_.lock(); }
+
+  // Unlock must release the mutex; calling lock() here would deadlock.
+  void Unlock() { mutex_.unlock(); }
+
+ private:
+  std::mutex mutex_;
+};
+#endif
+
+class AutoSpinLock {
+ public:
+  explicit AutoSpinLock(SpinLock* spin_lock) : lock_(spin_lock) {
+    lock_->Lock();
+  }
+
+  ~AutoSpinLock() { lock_->Unlock(); }
+
+ private:
+  SpinLock* lock_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 2205f473f2..3455d1ee54 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -292,23 +292,6 @@ class AdamOpKernel : public framework::OpKernel<T> {
           static_cast<const DeviceContext&>(ctx.device_context()),
           param.numel());
       for_range(functor);
-
-      auto& dev =
-          *ctx.template device_context<DeviceContext>().eigen_device();
-
-      const LoDTensor* beta1_pow_ptr = ctx.Input<LoDTensor>("Beta1Pow");
-      auto eigen_in_beta1_pow =
-          framework::EigenVector<T>::Flatten(*beta1_pow_ptr);
-      auto eigen_out_beta1_pow = framework::EigenVector<T>::Flatten(
-          *(const_cast<LoDTensor*>(beta1_pow_ptr)));
-      eigen_out_beta1_pow.device(dev) = beta1 * eigen_in_beta1_pow;
-
-      const LoDTensor* beta2_pow_ptr = ctx.Input<LoDTensor>("Beta2Pow");
-      auto eigen_in_beta2_pow =
-          framework::EigenVector<T>::Flatten(*beta2_pow_ptr);
-      auto eigen_out_beta2_pow = framework::EigenVector<T>::Flatten(
-          *(const_cast<LoDTensor*>(beta2_pow_ptr)));
-      eigen_out_beta2_pow.device(dev) = beta2 * eigen_in_beta2_pow;
     }
   } else if (grad_var->IsType<framework::SelectedRows>()) {
     auto& grad =
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 58ef3da0b2..f831f2313e 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -765,7 +765,7 @@ All parameter, weight, gradient are variables in Paddle.
          R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
          many iterations to clean up the temp variables which
          is generated during execution. It may make the execution faster,
-         because the temp variable's shape maybe the same between two iterations. Default 100.
+         because the temp variable's shape maybe the same between two iterations. Default 1.
 
          NOTES:
              1. If you fetch data when calling the 'run', the ParallelExecutor
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 1930ac106b..da92826d41 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -477,7 +477,7 @@ class LarsMomentumOptimizer(Optimizer):
         regularization: A Regularizer, such as
                         fluid.regularizer.L2DecayRegularizer.
         name: A optional name prefix.
-
+
     Examples:
         ..
code-block:: python @@ -739,27 +739,26 @@ class AdamOptimizer(Optimizer): """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - # for param, grad in param_and_grads: - - # if grad is None: - # continue - # with param.block.program._optimized_guard( - # [param, grad]), name_scope("optimizer"): - # beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - # param) - # beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - # param) - # main_block.append_op( - # type="scale", - # inputs={"X": beta1_pow_acc}, - # outputs={"Out": beta1_pow_acc}, - # attrs={"scale": self._beta1}) - - # main_block.append_op( - # type="scale", - # inputs={"X": beta2_pow_acc}, - # outputs={"Out": beta2_pow_acc}, - # attrs={"scale": self._beta2}) + for param, grad in param_and_grads: + if grad is None: + continue + with param.block.program._optimized_guard( + [param, grad]), name_scope("optimizer"): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param) + main_block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}) + + main_block.append_op( + type="scale", + inputs={"X": beta2_pow_acc}, + outputs={"Out": beta2_pow_acc}, + attrs={"scale": self._beta2}) class AdamaxOptimizer(Optimizer): From a81495d6f4a71980b51cc3099f8cd76885cdcb13 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:45:20 +0800 Subject: [PATCH 08/77] Fix code --- paddle/fluid/framework/scope.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index f05208c5ec..d2856a07a1 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include #include "glog/logging.h" +#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(benchmark, false, @@ -47,10 +48,10 @@ DEFINE_double( #define SCOPE_VARS_READER_LOCK #define SCOPE_VARS_WRITER_LOCK #else -#define SCOPE_KIDS_READER_LOCK AutoRDLock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock(&vars_lock_); +#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); #endif namespace paddle { From 19a798018f82b9eaa31aa8d84f8aa4306bbf8973 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:51:28 +0800 Subject: [PATCH 09/77] Remove dup cmake test=develop --- CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bf724e8aa9..1b2e0ecf6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,12 +81,6 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) -if (WITH_PROFILER) - find_package(Gperftools REQUIRED) - include_directories(${GPERFTOOLS_INCLUDE_DIR}) - add_definitions(-DWITH_GPERFTOOLS) -endif() - # PY_VERSION if(NOT PY_VERSION) set(PY_VERSION 2.7) From 728e7e88fb2c3467f6e28ef968b4e720d290b26c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 13:37:57 +0800 Subject: [PATCH 10/77] Use xxHash as scope's hash algorithm test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/scope.h | 26 ++++++++++++++++++++------ python/paddle/fluid/profiler.py | 2 +- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cea4a44857..5dca5ac598 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -82,7 +82,7 @@ cc_test(variable_test SRCS variable_test.cc) cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash) cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index b1abe75d76..4f79d98260 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -201,7 +201,7 @@ void Scope::RenameInternal(const std::string& origin_name, auto new_it = vars_.find(new_name); PADDLE_ENFORCE(new_it == vars_.end(), "The variable with name %s is already in the scope", new_name); - vars_[new_name].reset(origin_it.value().release()); + vars_[new_name].reset(origin_it->second.release()); vars_.erase(origin_it); } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index b232d267db..77ef18414d 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,15 +14,18 @@ limitations under the License. 
*/
 #pragma once
 
+extern "C" {
+#include <xxhash.h>
+}
+
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 
-#include <mutex>  // NOLINT
-
 #include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/macros.h"
@@ -35,6 +38,14 @@ bool IsFastEagerDeletionModeEnabled();
 
 class Scope;
 
+namespace inner {
+struct KeyHasher {
+  std::size_t operator()(const std::string& key) const {
+    return XXH32(key.c_str(), key.size(), 1);
+  }
+};
+}  // namespace inner
+
 /**
  * @brief Scope that manage all variables.
  *
@@ -99,11 +110,14 @@ class Scope {
   std::string Rename(const std::string& origin_name) const;
 
  protected:
-  mutable tsl::robin_map<
-      std::string, std::unique_ptr<Variable>, std::hash<std::string>,
-      std::equal_to<std::string>,
-      std::allocator<std::pair<std::string, std::unique_ptr<Variable>>>, true>
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>,
+                             inner::KeyHasher>
       vars_;
+  // mutable tsl::robin_map<
+  //     std::string, std::unique_ptr<Variable>, std::hash<std::string>,
+  //     std::equal_to<std::string>,
+  //     std::allocator<std::pair<std::string, std::unique_ptr<Variable>>>, true>
+  //     vars_;
 
  private:
   // Call Scope::NewScope for a sub-scope.
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 8df2e01b03..78f7a6ac08 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -93,7 +93,7 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     with open(config_file, 'wb') as fp:
         fp.writelines([six.b("%s\n" % item) for item in config])
     #Comment this for nvprof
-    #core.nvprof_init(output_file, output_mode, config_file)
+    core.nvprof_init(output_file, output_mode, config_file)
     # Enables profiler collection by the active CUDA profiling tool.
     core.nvprof_start()
     yield
From aa41ee75a16509cb16793d7fdbbbfa3ce2dab69f Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Mon, 17 Dec 2018 17:13:26 +0800
Subject: [PATCH 11/77] Accelerate PADDLE_ENFORCE

---
 paddle/fluid/framework/operator.h | 12 ++++--
 paddle/fluid/platform/enforce.h   | 68 +++++++++++++++++++------------
 2 files changed, 50 insertions(+), 30 deletions(-)

diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 0a6a28a5bc..63a8bc574f 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@";
 /// e.g. Variable "x@GRAD" is the gradient of variable "x".
 constexpr char kGradVarSuffix[] = "@GRAD";
 
+constexpr size_t kGradVarSuffixSize = 5U;
+
 /// Variables with this suffix are supposed to be filled up with zeros.
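// Editorial aside (sketch, not in the patch): kGradVarSuffixSize must stay in
// sync with the "@GRAD" literal above; a compile-time guard would make the
// invariant explicit:
//
//   static_assert(sizeof(kGradVarSuffix) - 1 == kGradVarSuffixSize,
//                 "kGradVarSuffixSize must equal strlen(kGradVarSuffix)");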
constexpr char kZeroVarSuffix[] = "@ZERO"; @@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; extern std::vector> kKernelPriority; inline std::string GradVarName(const std::string& var_name) { - return var_name + kGradVarSuffix; + std::string result; + result.reserve(var_name.size() + kGradVarSuffixSize); + result += var_name; + result += kGradVarSuffix; + return result; } proto::VarType::Type GetDataTypeOfVar(const Variable* var); @@ -101,8 +107,8 @@ class OperatorBase { bool HasAttr(const std::string& name) const { return attrs_.count(name); } template inline const T& Attr(const std::string& name) const { - PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); + PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(), + "%s should be in AttributeMap", name); return boost::get(attrs_.at(name)); } const AttributeMap& Attrs() const { return attrs_; } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 01ee67fd07..3c03a90279 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -140,68 +140,72 @@ struct EOFException : public std::exception { #define LIKELY(condition) (condition) #endif +inline bool is_error(bool stat) { return !stat; } + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { - if (UNLIKELY(!(stat))) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } } #ifdef PADDLE_WITH_CUDA +inline bool is_error(cudaError_t e) { return UNLIKELY(e); } + template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { - if (UNLIKELY(e)) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { - if (stat != CURAND_STATUS_SUCCESS) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cudnnStatus_t stat) { + return stat != CUDNN_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cudnnStatus_t stat, const Args&... args) { - if (stat == CUDNN_STATUS_SUCCESS) { - return; - } else { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cublasStatus_t stat) { + return stat != CUBLAS_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cublasStatus_t stat, const Args&... 
args) { std::string err; - if (stat == CUBLAS_STATUS_SUCCESS) { - return; - } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { err = "CUBLAS: alloc failed, "; @@ -254,11 +258,21 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) +#define PADDLE_JUDGE + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(cond))) { \ + ::paddle::platform::throw_on_error(cond, ##__VA_ARGS__); \ + } \ + } while (0) + #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(...) \ +#define PADDLE_ENFORCE(COND, ...) \ do { \ try { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ @@ -266,7 +280,7 @@ inline void throw_on_error(T e) { } while (false) #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); +#define PADDLE_ENFORCE(COND, ...) __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG #define PADDLE_THROW_EOF() \ From a3fa3f85d7bd4fb948b0401d77d5c60498e5a329 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 18 Dec 2018 15:04:26 +0800 Subject: [PATCH 12/77] Polish code test=develop --- paddle/fluid/platform/enforce.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 3c03a90279..d1dd09f206 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -260,12 +260,12 @@ inline void throw_on_error(T e) { #define PADDLE_JUDGE -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(cond))) { \ - ::paddle::platform::throw_on_error(cond, ##__VA_ARGS__); \ - } \ +#define __PADDLE_UNARY_COMPARE(COND, ...) 
\ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ + } \ } while (0) #ifndef REPLACE_ENFORCE_GLOG From dda28b0e682859c3868efe1ce65d636363faafd6 Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Wed, 19 Dec 2018 06:50:10 +0000 Subject: [PATCH 13/77] fix bug in if-else op, test=develop --- paddle/fluid/operators/split_lod_tensor_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 767449cde9..5ede972c71 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -63,7 +63,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { } auto *mask_data = cpu_mask->data(); - std::vector> copy_ranges(mask_dim[0]); + std::vector> copy_ranges(2); // set out_true/out_false lod for (size_t t = 0; t < 2; t++) { From ae6f46a1a9029284ba86ac0c783869a4c8468e17 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 19 Dec 2018 11:11:21 +0000 Subject: [PATCH 14/77] rewrite variable type test=develop --- paddle/fluid/framework/CMakeLists.txt | 16 +- .../framework/data_device_transform_test.cu | 1 + .../details/eager_deletion_op_handle.cc | 2 +- .../framework/details/variable_visitor.cc | 4 +- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/operator.cc | 12 +- paddle/fluid/framework/operator.h | 12 + paddle/fluid/framework/scope.cc | 4 +- paddle/fluid/framework/var_type.h | 32 ++- paddle/fluid/framework/var_type_traits.cc | 27 ++ paddle/fluid/framework/var_type_traits.h | 207 ++++++++++++++ .../fluid/framework/var_type_traits_test.cc | 75 ++++++ paddle/fluid/framework/variable.h | 64 ++--- paddle/fluid/framework/variable_test.cc | 23 +- .../api/details/reset_tensor_array.cc | 2 +- .../api/details/reset_tensor_array.h | 9 +- paddle/fluid/operators/affine_grid_op.cc | 4 +- paddle/fluid/operators/clip_by_norm_op.h | 2 +- .../operators/controlflow/parallel_do_op.cc | 3 +- .../fluid/operators/controlflow/while_op.cc | 7 +- paddle/fluid/operators/conv_op.cc | 4 +- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 241 +---------------- paddle/fluid/operators/cudnn_rnn_cache.h | 255 ++++++++++++++++++ .../distributed/brpc_sendrecvop_utils.cc | 3 +- .../operators/distributed_ops/split_ids_op.h | 2 +- .../elementwise/elementwise_mul_op.h | 2 +- paddle/fluid/operators/grid_sampler_op.cc | 4 +- .../fluid/operators/optimizers/adadelta_op.h | 6 +- .../fluid/operators/optimizers/adagrad_op.h | 3 +- paddle/fluid/operators/optimizers/adam_op.h | 3 +- paddle/fluid/operators/optimizers/adamax_op.h | 6 +- .../operators/optimizers/decayed_adagrad_op.h | 6 +- paddle/fluid/operators/optimizers/ftrl_op.h | 6 +- .../fluid/operators/optimizers/momentum_op.h | 2 +- paddle/fluid/operators/optimizers/sgd_op.cu | 3 +- paddle/fluid/operators/pool_op.cc | 4 +- paddle/fluid/operators/softmax_op.cc | 4 +- paddle/fluid/operators/sum_mkldnn_op.cc | 2 +- paddle/fluid/operators/sum_op.cc | 2 +- paddle/fluid/operators/sum_op.h | 2 +- paddle/fluid/operators/warpctc_op.cc | 2 +- paddle/fluid/platform/cudnn_helper.h | 13 - 42 files changed, 717 insertions(+), 366 deletions(-) create mode 100644 paddle/fluid/framework/var_type_traits.cc create mode 100644 paddle/fluid/framework/var_type_traits.h create mode 100644 paddle/fluid/framework/var_type_traits_test.cc create mode 100644 paddle/fluid/operators/cudnn_rnn_cache.h diff --git a/paddle/fluid/framework/CMakeLists.txt 
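// Editorial note on the PADDLE_ENFORCE rework above (sketch, assumptions
// marked): binding COND to a local __cond evaluates the condition exactly
// once, and overload resolution then picks the matching is_error /
// throw_on_error pair for each status type, e.g. the overloads added by
// this patch:
//
//   inline bool is_error(bool stat) { return !stat; }
//   inline bool is_error(cudaError_t e) { return UNLIKELY(e); }
//
// so both PADDLE_ENFORCE(ptr != nullptr, "...") and
// PADDLE_ENFORCE(cudaSetDevice(id)) go through one macro without evaluating
// their argument twice.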
b/paddle/fluid/framework/CMakeLists.txt index 412bc9cbe8..b6372a2ef5 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -78,17 +78,25 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) -cc_test(variable_test SRCS variable_test.cc) - cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) +if (WITH_GPU) + target_link_libraries(var_type_traits cudnn) + if (NOT WIN32) + target_link_libraries(var_type_traits nccl) + endif() +endif() +cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) + +cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits) cc_test(scope_test SRCS scope_test.cc DEPS scope) +cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context math_function) + DEPS operator op_registry device_context math_function scope) if(WITH_GPU) if (WIN32) diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index c9ec5e7a7b..96a2f9250f 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index abacb11e3b..03fbfd7f24 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() { } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 3dfd14419d..134f759081 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) { } else if (var->IsType()) { (*func)(var->GetMutable()); } else { - PADDLE_THROW("Not supported type %s", var->Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var->Type())); } } @@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) { } else if (var.IsType()) { (*func)(var.Get()); } else { - PADDLE_THROW("Not supported type %s", var.Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index da9556c6c1..594fbb48a6 100644 --- 
a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -119,7 +119,7 @@ static void DeleteUnusedTensors( } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a62afe248b..9b4a5011a8 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -365,7 +365,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { return &(var.Get().value()); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var.Type().name()); + ToTypeName(var.Type())); } } @@ -376,7 +376,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { return var->GetMutable()->mutable_value(); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -430,7 +430,7 @@ const std::vector ExecutionContext::MultiInput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -454,7 +454,7 @@ std::vector ExecutionContext::MultiOutput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return var->GetMutable(); }); return res; @@ -641,7 +641,7 @@ class RuntimeInferShapeContext : public InferShapeContext { PADDLE_THROW( "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " "type_id is %s.", - name, var->Type().name()); + name, ToTypeName(var->Type())); } } @@ -657,7 +657,7 @@ class RuntimeInferShapeContext : public InferShapeContext { var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", - name, var->Type().name()); + name, ToTypeName(var->Type())); } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bc..f8d2f1fe12 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -288,6 +288,18 @@ class ExecutionContext { const platform::DeviceContext& device_context_; }; +inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (use_cudnn) { + auto& dev_ctx = ctx.device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + return use_cudnn; +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const; diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 6fa5e99f9f..750b626603 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -165,11 +165,9 @@ std::string Scope::Rename(const std::string& origin_name) const { Variable* Scope::VarInternal(const std::string& name) { auto* v = FindVarLocally(name); if (v != nullptr) return v; - v = new Variable(); - vars_[name].reset(v); + vars_.emplace(name, std::unique_ptr(v)); VLOG(3) << "Create variable " << name; - v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 3b6f1cdb8f..f1cbaf3fdc 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -19,35 
+19,33 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/variable.h" namespace paddle { namespace framework { template -inline bool IsType(const std::type_index& type_index) { - return type_index == std::type_index(typeid(T)); +inline bool IsType(const std::type_index& type) { + return type == typeid(T); } -inline proto::VarType::Type ToVarType(std::type_index type) { - if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_RANK_TABLE; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR_ARRAY; - } else if (IsType(type)) { - return proto::VarType_Type_SELECTED_ROWS; - } else if (IsType(type)) { - return proto::VarType_Type_READER; - } else { - PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); +inline proto::VarType::Type ToVarType(int type) { + switch (type) { + case proto::VarType::LOD_TENSOR: + case proto::VarType::SELECTED_ROWS: + case proto::VarType::LOD_RANK_TABLE: + case proto::VarType::LOD_TENSOR_ARRAY: + case proto::VarType::READER: + return static_cast(type); + default: + PADDLE_THROW("ToVarType:Unsupported type %d", type); } } template inline void VisitVarType(const framework::Variable& var, Visitor visitor) { - switch (ToVarType(var.Type())) { + switch (var.Type()) { case proto::VarType_Type_LOD_TENSOR: visitor(var.Get()); return; @@ -64,7 +62,7 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { visitor(var.Get()); return; default: - PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); + PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc new file mode 100644 index 0000000000..0171df6f73 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/var_type_traits.h" + +namespace paddle { +namespace framework { + +const char* ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } + +const std::type_index& ToTypeIndex(int var_id) { + return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h new file mode 100644 index 0000000000..88f917e74f --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.h @@ -0,0 +1,207 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +namespace detail { + +template +struct TypePosFinderImpl { + static constexpr int kPos = + std::is_same::value + ? kStart + : TypePosFinderImpl::kPos; +}; + +template +struct TypePosFinderImpl { + static constexpr int kPos = std::is_same::value ? kStart : -1; +}; + +// TypePosFinder helps to find the position in which T is inside Args... +// If T is not inside Args..., kPos would be -1 +template +struct TypePosFinder { + static constexpr int kPos = + TypePosFinderImpl::kPos; +}; + +template +struct VarTypeRegistryImpl { + static constexpr size_t kRegisteredTypeNum = sizeof...(Args); + using ArgTuple = std::tuple; + + // TypePos() returns the position in which T is inside Args... + // If T is not inside Args... or T is void, return -1 + template + static constexpr int TypePos() { + return std::is_same::value ? -1 : TypePosFinder::kPos; + } + + // IsRegistered() returns whether T is registered inside RegistryImpl + template + static constexpr bool IsRegistered() { + return TypePos() >= 0; + } +}; + +} // namespace detail + +#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ + template <> \ + struct VarTypeTrait { \ + static_assert(VarTypeRegistry::IsRegistered(), \ + "Must be registered type"); \ + using Type = type; \ + static constexpr int kId = proto_id; \ + } + +/** + * The following codes are designed to register variable types. + * Only registered types can be stored in Variable. + * This registry mechanism is designed to speed up Variable. + */ + +// Users should add other variable types below. +// Paddle would generate unique Ids for each registered variable types. 
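// A minimal model of the registry machinery above (editorial sketch using the
// file's own detail::VarTypeRegistryImpl; the element types here are
// arbitrary examples, not the real registry contents):
//
//   using DemoRegistry = detail::VarTypeRegistryImpl<int, float, double, void>;
//   static_assert(DemoRegistry::TypePos<float>() == 1, "id = position in list");
//   static_assert(DemoRegistry::TypePos<char>() == -1, "not registered");
//   static_assert(!DemoRegistry::IsRegistered<char>(), "not registered");
//
// VarTypeTrait<T>::kId is derived from this compile-time position unless a
// REG_PROTO_VAR_TYPE_TRAIT specialization pins it to a framework.proto value.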
+class Scope; + +using VarTypeRegistry = detail::VarTypeRegistryImpl< + LoDTensor, SelectedRows, std::vector, LoDRankTable, LoDTensorArray, + platform::PlaceList, ReaderHolder, Tensor, std::string, Scope *, + std::map, operators::reader::LoDTensorBlockingQueueHolder, + int, float, +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 + ncclUniqueId, platform::Communicator, +#endif + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::CudnnRNNCache, +#endif + void>; // void indicates end of registration, add other types before void + +template +struct VarTypeTrait { + static_assert(std::is_same::value || + VarTypeRegistry::IsRegistered(), + "Must be registered type"); + using Type = T; + // Default id generation + static constexpr int kId = VarTypeRegistry::TypePos() + + static_cast(proto::VarType::TUPLE) * 2; +}; + +// Users should set some of variable type ids to be what is defined in +// framework.proto here +REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); +REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); +REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); +REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); +REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); +REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); +REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); + +/** End of variable type registration */ + +// Besides register variable id, it is helpful to register a +// var_id -> std::type_index (for example, get var names according to id) +namespace detail { + +template +struct VarIdToTypeIndexMapInitializerImpl { + static void Init(std::unordered_map *m) { + using Type = + typename std::tuple_element::type; + constexpr int kId = VarTypeTrait::kId; + if (!std::is_same::value) { + m->emplace(kId, std::type_index(typeid(Type))); + } + VarIdToTypeIndexMapInitializerImpl::Init(m); + } +}; + +template +struct VarIdToTypeIndexMapInitializerImpl { + static void Init(std::unordered_map *m) {} +}; + +// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> +// std::type_index map +using VarIdToTypeIndexMapInitializer = + VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, + VarTypeRegistry::kRegisteredTypeNum == + 0>; + +struct VarIdToTypeIndexMapHolder { + public: + static const std::type_index &ToTypeIndex(int var_id) { + static const VarIdToTypeIndexMapHolder instance; + auto it = instance.var_type_map_.find(var_id); + PADDLE_ENFORCE(it != instance.var_type_map_.end(), + "VarId %d is not registered.", var_id); + return it->second; + } + + private: + VarIdToTypeIndexMapHolder() { + VarIdToTypeIndexMapInitializer::Init(&var_type_map_); + } + std::unordered_map var_type_map_; +}; + +} // namespace detail + +const char *ToTypeName(int var_id); +const std::type_index &ToTypeIndex(int var_id); + +template +inline constexpr bool IsRegisteredVarType() { + return VarTypeRegistry::IsRegistered(); +} + +#undef REG_PROTO_VAR_TYPE_TRAIT +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc new file mode 100644 index 0000000000..09fab719c1 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/var_type_traits.h" +#include +#include + +namespace paddle { +namespace framework { + +template +struct TypeIndexChecker { + static void Check() { + using Type = + typename std::tuple_element::type; + if (!std::is_same::value) { + EXPECT_TRUE(ToTypeIndex(VarTypeTrait::kId) == typeid(Type)); + EXPECT_TRUE(std::string(ToTypeName(VarTypeTrait::kId)) == + typeid(Type).name()); + } + TypeIndexChecker::Check(); + } +}; + +template +struct TypeIndexChecker { + static void Check() {} +}; + +TEST(var_type_traits, check_type_index) { + constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum; + TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check(); +} + +template +bool CheckVarId(int proto_id) { + static_assert(std::is_same::Type, T>::value, + "Type must be the same"); + return VarTypeTrait::kId == proto_id; +} + +TEST(var_type_traits, check_proto_type_id) { + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR)); + ASSERT_TRUE(CheckVarId(proto::VarType::SELECTED_ROWS)); + ASSERT_TRUE(CheckVarId>(proto::VarType::STEP_SCOPES)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_RANK_TABLE)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); + ASSERT_TRUE(CheckVarId(proto::VarType::PLACE_LIST)); + ASSERT_TRUE(CheckVarId(proto::VarType::READER)); +} + +TEST(var_type_traits, test_registry) { + using Registry = + detail::VarTypeRegistryImpl; + ASSERT_TRUE(Registry::TypePos() == 0); + ASSERT_TRUE(Registry::TypePos() == 1); + ASSERT_TRUE(Registry::TypePos() == 2); + ASSERT_TRUE(Registry::TypePos() == 3); + ASSERT_TRUE(Registry::TypePos() == -1); + ASSERT_TRUE(Registry::TypePos() == -1); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a5..8aa68942ad 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/var_type_traits.h" namespace paddle { namespace framework { @@ -27,10 +27,14 @@ class Variable { public: template const T& Get() const { + static_assert( + IsRegisteredVarType(), + "Not registered type. 
Please register T inside var_type_traits.h"); PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); return *static_cast(holder_->Ptr()); } @@ -39,61 +43,59 @@ class Variable { template T* GetMutable() { if (!holder_) { - holder_.reset(new PlaceholderImpl(new T())); + holder_.reset(new PlaceholderImpl()); } else { - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); } return static_cast(holder_->Ptr()); } template bool IsType() const { - return holder_ != nullptr && - std::type_index(typeid(T)) == std::type_index(holder_->Type()); + return holder_ && holder_->Type() == VarTypeTrait::kId; } void Clear() { holder_.reset(); } - std::type_index Type() const { + int Type() const { PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); return holder_->Type(); } private: struct Placeholder { - virtual ~Placeholder() {} - virtual const std::type_info& Type() const = 0; - virtual void* Ptr() const = 0; + explicit Placeholder(int type) : type_(type) {} + virtual ~Placeholder() = default; + + inline int Type() const { return type_; } + inline const void* Ptr() const { return ptr_; } + inline void* Ptr() { return ptr_; } + + protected: + void* ptr_; + int type_; }; // Placeholder hides type T, so it doesn't appear as a template // parameter of Variable. template struct PlaceholderImpl : public Placeholder { - explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} - - virtual const std::type_info& Type() const { return type_; } - virtual void* Ptr() const { return static_cast(ptr_.get()); } + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PlaceholderImpl() : Placeholder(VarTypeTrait::kId) { + this->ptr_ = &obj_; + } - std::unique_ptr ptr_; - const std::type_info& type_; + private: + T obj_; }; - std::unique_ptr - holder_; // pointers to a PlaceholderImpl object indeed. - - // name_ is only meaningful with a Scope and accessible by it. - // - // NOTE: Please don't expose name_ by adding methods like - // Variable::Name or Scope::VarName! A variable could have a human - // readable name or an auto-generated scope-unique name. In the - // former case, the caller knows the name and doesn't need to access - // the name; in the latter case, the variable should be identified - // by its address but not the unreadable name. - friend class Scope; - const std::string* name_; + // pointers to a PlaceholderImpl object indeed. 
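// Usage sketch for the reworked Variable (editorial, not from the patch):
// type checks now compare small registered integer ids instead of
// std::type_index, e.g.
//
//   Variable var;
//   auto* t = var.GetMutable<LoDTensor>();  // creates PlaceholderImpl<LoDTensor>
//   bool ok = var.IsType<LoDTensor>();      // plain integer comparison
//   int id = var.Type();                    // == VarTypeTrait<LoDTensor>::kId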
+ std::unique_ptr holder_; }; } // namespace framework diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3df..511c9c5214 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -16,27 +16,28 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -TEST(Variable, GetMutable) { - using paddle::framework::Variable; - - struct Tensor { - int content_; - }; +namespace paddle { +namespace framework { +TEST(Variable, GetMutable) { std::unique_ptr v(new Variable()); - Tensor* t = v->GetMutable(); - t->content_ = 1234; + auto* t = v->GetMutable(); + *t = "1234"; - const Tensor& tt = v->Get(); - EXPECT_EQ(1234, tt.content_); + const auto& tt = v->Get(); + EXPECT_EQ("1234", tt); try { - v->GetMutable(); + v->GetMutable(); } catch (std::exception& e) { return; } EXPECT_TRUE(false); } + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 569a487328..03c2aa3fb8 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -25,7 +25,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { // TODO(Superjomn) should avoid the case when a TensorArray is a // parameter. if (var_name == "feed" || var_name == "fetch") continue; - if (var->Type() == typeid(framework::LoDTensorArray)) { + if (var->IsType()) { VLOG(4) << "collect " << var_name; arrays_.push_back(var->GetMutable()); } diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index 6a5ea64de6..213c6891d0 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -27,8 +27,11 @@ namespace details { // training phase. 
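
The constructor below applies the same idea to the cleaner's whitelist: valid_types_ becomes an unordered_set of integer ids (see the member change further down in this hunk), so the per-variable scan hashes an int rather than a std::type_index. A self-contained sketch of that check, with made-up ids:

#include <cassert>
#include <unordered_set>

// Stand-ins for framework::VarTypeTrait<Tensor>::kId and friends;
// the concrete values here are illustrative assumptions.
constexpr int kTensorId = 0;
constexpr int kLoDTensorId = 1;
constexpr int kReaderId = 7;

int main() {
  const std::unordered_set<int> valid_types{kTensorId, kLoDTensorId};
  assert(valid_types.count(kLoDTensorId) == 1);  // plain int hash, no RTTI
  assert(valid_types.count(kReaderId) == 0);     // such vars get reset instead
  return 0;
}
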
struct TensorArrayBatchCleaner { TensorArrayBatchCleaner() { - valid_types_.insert(typeid(framework::Tensor)); - valid_types_.insert(typeid(framework::LoDTensor)); + constexpr auto kTensorId = framework::VarTypeTrait::kId; + constexpr auto kLoDTensorId = + framework::VarTypeTrait::kId; + valid_types_.insert(kTensorId); + valid_types_.insert(kLoDTensorId); } // Collect the variables that are not Tensor or LoDTensor, and reset them to a // bool(trick), because some of them are containers, and some operators just @@ -46,7 +49,7 @@ struct TensorArrayBatchCleaner { bool no_tensor_flag_{true}; std::vector arrays_; - std::unordered_set valid_types_; + std::unordered_set valid_types_; std::unordered_set no_tensor_vars_; }; diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 1de59a5165..0c04873852 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -74,7 +74,7 @@ class AffineGridOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; } #endif @@ -184,7 +184,7 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 855c4d7067..49e734ce96 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -64,7 +64,7 @@ class ClipByNormKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", - in_var->Type().name()); + framework::ToTypeName(in_var->Type())); } PADDLE_ENFORCE_NOT_NULL(input); diff --git a/paddle/fluid/operators/controlflow/parallel_do_op.cc b/paddle/fluid/operators/controlflow/parallel_do_op.cc index ab25628d45..5bcc597dec 100644 --- a/paddle/fluid/operators/controlflow/parallel_do_op.cc +++ b/paddle/fluid/operators/controlflow/parallel_do_op.cc @@ -92,7 +92,8 @@ inline void CopyOrShare(const framework::Variable &src, TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value()); } } else { - PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); + PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", + framework::ToTypeName(src.Type())); } } diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index e91d9ef776..9b5eda17fa 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -175,14 +175,13 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); - if (framework::IsType(og_outside.Type())) { + if (og_outside.IsType()) { auto &outside_tensor = og_outside.Get(); auto &inside_tensor = detail::Ref(og_inside.GetMutable()); inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.ShareDataWith(outside_tensor); - } else if (framework::IsType( - og_outside.Type())) { + } else if 
(og_outside.IsType()) { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); @@ -256,7 +255,7 @@ class WhileGradOp : public framework::OperatorBase { var->IsType(), "Currently the type of var only can be LoDTensorArray, " "or LoDTensor, but the received var[%s] is %s.", - inside_grad_name, var->Type().name()); + inside_grad_name, framework::ToTypeName(var->Type())); if (var->IsType()) { auto &inside_tensor = var->Get(); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8e0d282495..c76bde99f4 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -84,7 +84,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( framework::DataLayout layout = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; } #endif @@ -369,7 +369,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index f2ba75485c..fae0925149 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { @@ -22,239 +22,6 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -struct CudnnRNNCache { - CudnnRNNCache() { - x_desc_ = NULL; - y_desc_ = NULL; - dx_desc_ = NULL; - dy_desc_ = NULL; - } - ~CudnnRNNCache() { release(); } - - cudnnRNNDescriptor_t rnn_desc_; - cudnnTensorDescriptor_t *x_desc_; - cudnnTensorDescriptor_t *y_desc_; - cudnnTensorDescriptor_t *dx_desc_; - cudnnTensorDescriptor_t *dy_desc_; - - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; - - cudnnTensorDescriptor_t dhx_desc_; - cudnnTensorDescriptor_t dcx_desc_; - cudnnTensorDescriptor_t dhy_desc_; - cudnnTensorDescriptor_t dcy_desc_; - - cudnnTensorDescriptor_t output_x_desc_; - cudnnTensorDescriptor_t output_y_desc_; - - cudnnDropoutDescriptor_t dropout_desc_; - - size_t weights_size_; - cudnnFilterDescriptor_t w_desc_; - cudnnFilterDescriptor_t dw_desc_; - - size_t workspace_size_; - size_t reserve_size_; - Tensor reserve_data_; - Tensor workspace_data_; - - Tensor dropout_state_; - - size_t max_length_; - - float dropout_prob_; - bool is_bidirec_; - - int batch_size_; - int input_size_; - int hidden_size_; - int num_layers_; - int seed_; - - void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, - size_t max_len, int batch_size, int input_size, int hidden_size, - int num_layers, float dropout_prob, bool is_bidirec, int seed, - int weight_numel) { - max_length_ = max_len; - batch_size_ = batch_size; - input_size_ = input_size; - hidden_size_ = hidden_size; - num_layers_ = num_layers; - dropout_prob_ = dropout_prob; - 
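
None of the CudnnRNNCache code deleted here is lost: it reappears essentially line for line in the new header paddle/fluid/operators/cudnn_rnn_cache.h added further down, with init() taking a platform::Place instead of the whole ExecutionContext. Pulling the struct into its own header lets other translation units — notably the var-type registry — name the type without compiling the LSTM kernel. The kernel keeps the usual build-once/reuse-every-step usage, sketched here with illustrative names:

#include <cassert>

// Sketch of the lazy-init pattern around CudnnRNNCache: the expensive cuDNN
// descriptor setup runs on the first mini-batch only, then is reused.
struct RnnCacheSketch {
  bool initialized{false};
  int max_len{0};
  void init(int len) {  // stands in for CudnnRNNCache::init(handle, place, ...)
    max_len = len;
    initialized = true;
  }
};

int main() {
  RnnCacheSketch cache;
  for (int step = 0; step < 3; ++step) {
    if (!cache.initialized) cache.init(32);  // only step 0 pays the setup cost
    assert(cache.max_len == 32);
  }
  return 0;
}
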
is_bidirec_ = is_bidirec; - seed_ = seed; - - x_desc_ = new cudnnTensorDescriptor_t[max_length_]; - y_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; - int dim_a[3]; - int stride_a[3]; - - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); - dim_a[0] = batch_size_; - dim_a[1] = input_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - dim_a[0] = batch_size_; - dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - } - - dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); - dim_a[1] = batch_size_; - dim_a[2] = hidden_size_; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - CUDNN_ENFORCE( - platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); - - size_t state_size; - CUDNN_ENFORCE( - platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); - dropout_state_.Resize({static_cast(state_size)})); - auto *dropout_state_data = - dropout_state_.mutable_data(ctx.GetPlace()); - 
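
Note the shape of the CUDNN_ENFORCE call just above, which is carried over unchanged into the new header below: a semicolon and a second statement sit inside the macro argument. The preprocessor splits macro arguments only on top-level commas, so this expands — assuming CUDNN_ENFORCE splices its argument unparenthesized into a status assignment — with the Resize call riding along, while only the cudnnDropoutGetStatesSize status is actually checked. A clearer equivalent fragment (not standalone; the int64_t/uint8_t template arguments are assumptions, as the surrounding text does not show them):

size_t state_size;
CUDNN_ENFORCE(
    platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
// Resize and allocate outside the macro so intent and error checking stay obvious.
dropout_state_.Resize({static_cast<int64_t>(state_size)});
auto *dropout_state_data = dropout_state_.mutable_data<uint8_t>(place);
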
CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( - dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, - seed_)); - - CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); - -#if CUDNN_VERSION >= 6000 - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( - handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); -#else - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_DATA_FLOAT)); -#endif - - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( - handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); - - PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, - "cudnn lstm weight size should be SAME"); - int dim_w[3]; - dim_w[0] = weights_size_ / sizeof(float); - dim_w[1] = 1; - dim_w[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( - handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( - handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); - - reserve_data_.Resize({static_cast(reserve_size_)}); - reserve_data_.mutable_data(ctx.GetPlace()); - - workspace_data_.Resize({static_cast(workspace_size_)}); - workspace_data_.mutable_data(ctx.GetPlace()); - } - - void release() { - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); - } - - delete[] x_desc_; - delete[] y_desc_; - delete[] dx_desc_; - delete[] dy_desc_; - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); - - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); - } -}; - template class CudnnLSTMGPUKernel : public framework::OpKernel { public: @@ -315,9 +82,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { auto input_w_numel = w->numel(); 
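
The kernel change just below is the visible half of the header extraction: init() now receives ctx.GetPlace() rather than the full ExecutionContext, because the cache only ever used the context to pick an allocation place. Narrowing the parameter is what lets cudnn_rnn_cache.h get by with tensor.h and cudnn_helper.h instead of framework/operator.h. A minimal sketch of the same interface narrowing, with illustrative types:

#include <cassert>
#include <string>

struct Place { std::string name; };

struct ExecContextSketch {  // stands in for framework::ExecutionContext
  Place place;
  const Place& GetPlace() const { return place; }
  // ...dozens of other members the cache never touched...
};

// Before: void Init(const ExecContextSketch& ctx);  // drags in the heavy header
// After: ask only for what is actually used.
void Init(const Place& place) { assert(!place.name.empty()); }

int main() {
  ExecContextSketch ctx{{"gpu:0"}};
  Init(ctx.GetPlace());
  return 0;
}
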
auto batch_size = x->dims()[1]; - cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, - hidden_size, num_layers, dropout_prob, is_bidirec, - seed, input_w_numel); + cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size, + input_size, hidden_size, num_layers, dropout_prob, + is_bidirec, seed, input_w_numel); } auto run_seq_len = x->dims()[0]; diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h new file mode 100644 index 0000000000..7f18b83927 --- /dev/null +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + framework::Tensor reserve_data_; + framework::Tensor workspace_data_; + + framework::Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len, + int batch_size, int input_size, int hidden_size, int num_layers, + float dropout_prob, bool is_bidirec, int seed, int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + 
platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = dropout_state_.mutable_data(place); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(place); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(place); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc index 6fed9ba92c..c35474e3aa 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -171,8 +171,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + size_t 
rows_memory_size = slr->rows().size() * sizeof(int64_t); IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, reinterpret_cast(slr->rows().data()), diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.h b/paddle/fluid/operators/distributed_ops/split_ids_op.h index acc9b1e622..6676ecd1c8 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.h @@ -116,7 +116,7 @@ class SplitIdsOpKernel : public framework::OpKernel { } else { PADDLE_THROW( "% should be LoDTensor or SelectedRows, but the received type is %s", - ctx.Inputs("Ids")[0], ids_var->Type().name()); + ctx.Inputs("Ids")[0], framework::ToTypeName(ids_var->Type())); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index a8b8a67a11..7a7a3989c0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -83,7 +83,7 @@ class ElementwiseMulKernel : public framework::OpKernel { z = ctx.Output("Out"); } else { PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - x_var->Type().name()); + framework::ToTypeName(x_var->Type())); } z->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 14a2524bd8..be53a62cc9 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -59,7 +59,7 @@ class GridSampleOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -155,7 +155,7 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h index 6c616aa03d..3f51bb0b3d 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ b/paddle/fluid/operators/optimizers/adadelta_op.h @@ -27,12 +27,14 @@ class AdadeltaOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h index 9f6ef39169..13455fc42c 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.h +++ b/paddle/fluid/operators/optimizers/adagrad_op.h @@ -50,7 +50,8 @@ class AdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type 
should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto *param_out_tensor = ctx.Output("ParamOut"); auto *moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 3455d1ee54..d8042e3614 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -235,7 +235,8 @@ class AdamOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h index 7137fbd965..55d25ecbdd 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ b/paddle/fluid/operators/optimizers/adamax_op.h @@ -27,12 +27,14 @@ class AdamaxOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h index 5df43d33ef..4abd436927 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h @@ -27,12 +27,14 @@ class DecayedAdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 8f812c9a03..bbf34d8316 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -32,12 +32,14 @@ class FTRLOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), 
grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto* param_out = ctx.Output("ParamOut"); auto* sq_accum_out = ctx.Output("SquaredAccumOut"); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 71f079e4d9..84955d3f04 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -393,7 +393,7 @@ class MomentumOpKernel : public framework::OpKernel { PADDLE_THROW( string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " "gradient, but the received Variable Type is %s", - grad_var->Type().name())); + framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a9d303d55d..975e4b8e72 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -60,7 +60,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 5399ae556e..6781cdf9f3 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -92,7 +92,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -122,7 +122,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index bc889a5a04..ad37967f0a 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -50,7 +50,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -157,7 +157,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index f9a16ef35e..c39f94637a 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -245,7 +245,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 
4f717a4355..01996e6bf9 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -126,7 +126,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_THROW("Cannot find the input data type by all input data"); } PADDLE_THROW("Unexpected branch. Input type is %s", - x_vars[0]->Type().name()); + framework::ToTypeName(x_vars[0]->Type())); } }; diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 76cc796a9b..a8b2df186d 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -163,7 +163,7 @@ class SumKernel : public framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index e2ae7caae1..add03bad13 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -51,7 +51,7 @@ class WarpCTCOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 61a25064d1..74b0942379 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -451,18 +450,6 @@ class ScopedActivationDescriptor { DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); }; -inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); -#ifdef PADDLE_WITH_CUDA - if (use_cudnn) { - auto& dev_ctx = ctx.device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; - } -#endif - return use_cudnn; -} - #if CUDNN_VERSION >= 7001 class ScopedCTCLossDescriptor { public: From ce4a26ddad08a9d640f1ec3ddae254d0d0abd004 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 19 Dec 2018 12:23:11 +0000 Subject: [PATCH 15/77] clean code try to fix mac compile bug? 
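
This patch demotes the var_id → std::type_index machinery from the public header into var_type_traits.cc: the recursive initializer templates become an implementation detail, and callers see only two free functions. A hypothetical call site (the helper itself is made up; the two function signatures are the ones declared in the header):

#include <iostream>
#include "paddle/fluid/framework/var_type_traits.h"

// Report a registered variable type id in readable form.
// Both lookups PADDLE_ENFORCE that var_id was actually registered.
void DumpVarType(int var_id) {
  std::cout << "id " << var_id << " -> "
            << paddle::framework::ToTypeName(var_id) << std::endl;
}
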
test=develop --- paddle/fluid/framework/CMakeLists.txt | 5 +- paddle/fluid/framework/var_type_traits.cc | 53 +++++++++++++++++- paddle/fluid/framework/var_type_traits.h | 55 +------------------ .../fluid/framework/var_type_traits_test.cc | 30 +++++++--- 4 files changed, 77 insertions(+), 66 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b6372a2ef5..d0beb8361c 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -83,10 +83,7 @@ cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) if (WITH_GPU) - target_link_libraries(var_type_traits cudnn) - if (NOT WIN32) - target_link_libraries(var_type_traits nccl) - endif() + target_link_libraries(var_type_traits dynload_cuda) endif() cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 0171df6f73..c9f9f8d6c6 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -17,9 +17,58 @@ namespace paddle { namespace framework { -const char* ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } +// Besides registering variable type id, it is helpful to register a +// var_id -> std::type_index map (for example, get type names according to id) +namespace detail { -const std::type_index& ToTypeIndex(int var_id) { +template +struct VarIdToTypeIndexMapInitializerImpl { + static void Init(std::unordered_map *m) { + using Type = + typename std::tuple_element::type; + constexpr int kId = VarTypeTrait::kId; + if (!std::is_same::value) { + m->emplace(kId, std::type_index(typeid(Type))); + } + VarIdToTypeIndexMapInitializerImpl::Init(m); + } +}; + +template +struct VarIdToTypeIndexMapInitializerImpl { + static void Init(std::unordered_map *m) {} +}; + +// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> +// std::type_index map +using VarIdToTypeIndexMapInitializer = + VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, + VarTypeRegistry::kRegisteredTypeNum == + 0>; + +struct VarIdToTypeIndexMapHolder { + public: + static const std::type_index &ToTypeIndex(int var_id) { + static const VarIdToTypeIndexMapHolder instance; + auto it = instance.var_type_map_.find(var_id); + PADDLE_ENFORCE(it != instance.var_type_map_.end(), + "VarId %d is not registered.", var_id); + return it->second; + } + + private: + VarIdToTypeIndexMapHolder() { + VarIdToTypeIndexMapInitializer::Init(&var_type_map_); + } + std::unordered_map var_type_map_; +}; + +} // namespace detail + +const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } + +const std::type_index &ToTypeIndex(int var_id) { return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); } diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 88f917e74f..c5e0d4707e 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -40,6 +40,9 @@ namespace paddle { namespace framework { +const char *ToTypeName(int var_id); +const std::type_index &ToTypeIndex(int var_id); + namespace detail { template std::type_index (for example, get var names according to id) -namespace detail { - -template -struct VarIdToTypeIndexMapInitializerImpl { - static void Init(std::unordered_map *m) { - using Type = - 
typename std::tuple_element::type; - constexpr int kId = VarTypeTrait::kId; - if (!std::is_same::value) { - m->emplace(kId, std::type_index(typeid(Type))); - } - VarIdToTypeIndexMapInitializerImpl::Init(m); - } -}; - -template -struct VarIdToTypeIndexMapInitializerImpl { - static void Init(std::unordered_map *m) {} -}; - -// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> -// std::type_index map -using VarIdToTypeIndexMapInitializer = - VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, - VarTypeRegistry::kRegisteredTypeNum == - 0>; - -struct VarIdToTypeIndexMapHolder { - public: - static const std::type_index &ToTypeIndex(int var_id) { - static const VarIdToTypeIndexMapHolder instance; - auto it = instance.var_type_map_.find(var_id); - PADDLE_ENFORCE(it != instance.var_type_map_.end(), - "VarId %d is not registered.", var_id); - return it->second; - } - - private: - VarIdToTypeIndexMapHolder() { - VarIdToTypeIndexMapInitializer::Init(&var_type_map_); - } - std::unordered_map var_type_map_; -}; - -} // namespace detail - -const char *ToTypeName(int var_id); -const std::type_index &ToTypeIndex(int var_id); - template inline constexpr bool IsRegisteredVarType() { return VarTypeRegistry::IsRegistered(); diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 09fab719c1..f46608233a 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -15,32 +15,46 @@ #include "paddle/fluid/framework/var_type_traits.h" #include #include +#include namespace paddle { namespace framework { template struct TypeIndexChecker { - static void Check() { + template + static void Check(SetType1 *var_id_set, SetType2 *type_index_set) { using Type = typename std::tuple_element::type; + static_assert(std::is_same::Type, Type>::value, + "Type must be the same"); + constexpr auto kId = VarTypeTrait::kId; if (!std::is_same::value) { - EXPECT_TRUE(ToTypeIndex(VarTypeTrait::kId) == typeid(Type)); - EXPECT_TRUE(std::string(ToTypeName(VarTypeTrait::kId)) == - typeid(Type).name()); + std::type_index actual_type(typeid(Type)); + EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); + EXPECT_EQ(ToTypeIndex(kId), actual_type); + EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT + EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT + var_id_set->insert(kId); + type_index_set->insert(std::type_index(typeid(Type))); } - TypeIndexChecker::Check(); + TypeIndexChecker::Check(var_id_set, + type_index_set); } }; template struct TypeIndexChecker { - static void Check() {} + template + static void Check(SetType1 *, SetType2 *) {} }; -TEST(var_type_traits, check_type_index) { +TEST(var_type_traits, check_no_duplicate_registry) { constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum; - TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check(); + std::unordered_set var_id_set; + std::unordered_set type_index_set; + TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check( + &var_id_set, &type_index_set); } template From 454db6662e15234df8f0765c098d171e75d5ec1a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 20 Dec 2018 00:56:05 +0800 Subject: [PATCH 16/77] Accelerate lstm --- paddle/fluid/operators/math/concat_and_split.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 760a065c10..930d851696 
100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -180,7 +180,7 @@ class ConcatFunctor { } // Wait() must be called because `inputs_data` may be destructed before // kernel ends - context.Wait(); + /* context.Wait(); */ } }; @@ -258,7 +258,7 @@ class SplitFunctor { } // Wait() must be called because `outputs_data` may be destructed before // kernel ends - context.Wait(); + /* context.Wait(); */ } }; From 13429c3e9f92877ca8c282e1cae2d752a506b7ac Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 20 Dec 2018 02:56:11 +0000 Subject: [PATCH 17/77] clean code, remove void registration test why MAC CI fail again test=develop --- paddle/fluid/framework/var_type_traits.cc | 58 ++++++++++++++----- paddle/fluid/framework/var_type_traits.h | 33 ++++++----- .../fluid/framework/var_type_traits_test.cc | 33 +++++++---- 3 files changed, 83 insertions(+), 41 deletions(-) diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index c9f9f8d6c6..690c4895c1 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace framework { @@ -23,54 +24,83 @@ namespace detail { template struct VarIdToTypeIndexMapInitializerImpl { - static void Init(std::unordered_map *m) { + template + static void Init(MapType1 *id_to_type, MapType2 *type_to_id) { using Type = typename std::tuple_element::type; + static_assert(!std::is_same::value, "Type cannot be void"); constexpr int kId = VarTypeTrait::kId; - if (!std::is_same::value) { - m->emplace(kId, std::type_index(typeid(Type))); - } + auto type = std::type_index(typeid(Type)); + PADDLE_ENFORCE(id_to_type->count(kId) == 0, + "Registered duplicate type id %d for type %s", kId, + type.name()); + PADDLE_ENFORCE(type_to_id->count(type) == 0, + "Registered duplicate type_index %s for id %d", type.name(), + kId); + id_to_type->emplace(kId, type); + type_to_id->emplace(type, kId); VarIdToTypeIndexMapInitializerImpl::Init(m); + kStart + 1 == kEnd>::Init(id_to_type, + type_to_id); } }; template struct VarIdToTypeIndexMapInitializerImpl { - static void Init(std::unordered_map *m) {} + template + static void Init(MapType1 *, MapType2 *) {} }; // VarIdToTypeIndexMapInitializer is designed to initialize var_id -> -// std::type_index map +// std::type_index map and std::type_index -> var_id map using VarIdToTypeIndexMapInitializer = VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, VarTypeRegistry::kRegisteredTypeNum == 0>; struct VarIdToTypeIndexMapHolder { + DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder); + public: static const std::type_index &ToTypeIndex(int var_id) { - static const VarIdToTypeIndexMapHolder instance; - auto it = instance.var_type_map_.find(var_id); - PADDLE_ENFORCE(it != instance.var_type_map_.end(), + auto it = Instance().id_to_type_map_.find(var_id); + PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(), "VarId %d is not registered.", var_id); return it->second; } + static int ToTypeId(const std::type_index &type) { + auto it = Instance().type_to_id_map_.find(type); + PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), + "VarType %s is not registered.", type.name()); + return it->second; + } + private: VarIdToTypeIndexMapHolder() { - VarIdToTypeIndexMapInitializer::Init(&var_type_map_); + 
VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_); + } + + static const VarIdToTypeIndexMapHolder &Instance() { + static const VarIdToTypeIndexMapHolder instance; + return instance; } - std::unordered_map var_type_map_; + + std::unordered_map id_to_type_map_; + std::unordered_map type_to_id_map_; }; } // namespace detail -const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } - const std::type_index &ToTypeIndex(int var_id) { return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); } +const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } + +int ToTypeId(const std::type_index &type) { + return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index c5e0d4707e..a58414c3d4 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -42,6 +42,7 @@ namespace framework { const char *ToTypeName(int var_id); const std::type_index &ToTypeIndex(int var_id); +int ToTypeId(const std::type_index &type); namespace detail { @@ -75,10 +76,10 @@ struct VarTypeRegistryImpl { using ArgTuple = std::tuple; // TypePos() returns the position in which T is inside Args... - // If T is not inside Args... or T is void, return -1 + // If T is not inside Args..., return -1 template static constexpr int TypePos() { - return std::is_same::value ? -1 : TypePosFinder::kPos; + return TypePosFinder::kPos; } // IsRegistered() returns whether T is registered inside RegistryImpl @@ -90,19 +91,22 @@ struct VarTypeRegistryImpl { } // namespace detail -#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ - template <> \ - struct VarTypeTrait { \ - static_assert(VarTypeRegistry::IsRegistered(), \ - "Must be registered type"); \ - using Type = type; \ - static constexpr int kId = proto_id; \ +#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ + template <> \ + struct VarTypeTrait { \ + static_assert(VarTypeRegistry::IsRegistered(), \ + "Must be registered type"); \ + using Type = type; \ + static constexpr int kId = static_cast(proto_id); \ } /** * The following codes are designed to register variable types. * Only registered types can be stored in Variable. * This registry mechanism is designed to speed up Variable. + * + * Caution: If you want to add more var types, please consider carefully + * whether you really need to add it. */ // Users should add other variable types below. 
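
The hunk below reorders the registry's type list and drops the trailing void sentinel, so TypePos() returns -1 only for genuinely unregistered types; proto-backed types keep their protobuf ids through REG_PROTO_VAR_TYPE_TRAIT, and every other type derives its kId from its position in the list. A self-contained sketch of the compile-time position search (simplified from TypePosFinder; names here are illustrative):

#include <type_traits>

// Compile-time search for T inside a parameter pack; -1 means "not registered".
template <typename T, typename... Ts>
struct PosFinder;

template <typename T>
struct PosFinder<T> {
  static constexpr int kPos = -1;
};

template <typename T, typename Head, typename... Tail>
struct PosFinder<T, Head, Tail...> {
  static constexpr int kTailPos = PosFinder<T, Tail...>::kPos;
  static constexpr int kPos =
      std::is_same<T, Head>::value ? 0 : (kTailPos == -1 ? -1 : 1 + kTailPos);
};

struct A {}; struct B {}; struct C {}; struct D {};

static_assert(PosFinder<A, A, B, C>::kPos == 0, "first entry");
static_assert(PosFinder<C, A, B, C>::kPos == 2, "last entry");
static_assert(PosFinder<D, A, B, C>::kPos == -1, "unregistered");

int main() { return 0; }
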
@@ -110,10 +114,9 @@ struct VarTypeRegistryImpl { class Scope; using VarTypeRegistry = detail::VarTypeRegistryImpl< - LoDTensor, SelectedRows, std::vector, LoDRankTable, LoDTensorArray, - platform::PlaceList, ReaderHolder, Tensor, std::string, Scope *, + Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, + LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, std::map, operators::reader::LoDTensorBlockingQueueHolder, - int, float, #ifdef PADDLE_WITH_CUDA #ifndef _WIN32 ncclUniqueId, platform::Communicator, @@ -123,13 +126,11 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< operators::AlgorithmsCache, operators::CudnnRNNCache, #endif - void>; // void indicates end of registration, add other types before void + int, float>; template struct VarTypeTrait { - static_assert(std::is_same::value || - VarTypeRegistry::IsRegistered(), - "Must be registered type"); + static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); using Type = T; // Default id generation static constexpr int kId = VarTypeRegistry::TypePos() + diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index f46608233a..4dad4cb27b 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/var_type_traits.h" #include #include +#include #include namespace paddle { @@ -29,15 +30,27 @@ struct TypeIndexChecker { static_assert(std::is_same::Type, Type>::value, "Type must be the same"); constexpr auto kId = VarTypeTrait::kId; - if (!std::is_same::value) { - std::type_index actual_type(typeid(Type)); - EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); - EXPECT_EQ(ToTypeIndex(kId), actual_type); - EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT - EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT - var_id_set->insert(kId); - type_index_set->insert(std::type_index(typeid(Type))); + std::type_index actual_type(typeid(Type)); + EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); + // For some reasons, comparing std::type_index using EXPECT_EQ would fail + // in MAC CI + bool is_same_type_index = (ToTypeIndex(kId) == actual_type); + if (!is_same_type_index) { + std::string s1 = ToTypeName(kId); + std::string s2 = actual_type.name(); + PADDLE_THROW("Step %d: type %s is not the same as %s, var_id %d", kPos, + s1.c_str(), s2.c_str(), kId); } + EXPECT_TRUE(is_same_type_index); + EXPECT_TRUE(ToTypeId(actual_type) == kId); // NOLINT + is_same_type_index = (ToTypeIndex(ToTypeId(actual_type)) == actual_type); + EXPECT_TRUE(is_same_type_index); + EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); + + EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT + EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT + var_id_set->insert(kId); + type_index_set->insert(std::type_index(typeid(Type))); TypeIndexChecker::Check(var_id_set, type_index_set); } @@ -75,13 +88,11 @@ TEST(var_type_traits, check_proto_type_id) { } TEST(var_type_traits, test_registry) { - using Registry = - detail::VarTypeRegistryImpl; + using Registry = detail::VarTypeRegistryImpl; ASSERT_TRUE(Registry::TypePos() == 0); ASSERT_TRUE(Registry::TypePos() == 1); ASSERT_TRUE(Registry::TypePos() == 2); ASSERT_TRUE(Registry::TypePos() == 3); - ASSERT_TRUE(Registry::TypePos() == -1); ASSERT_TRUE(Registry::TypePos() == -1); } From 7f6e513b1fa798745d7cb918bd7a56d66607aed3 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 20 Dec 
2018 12:21:51 +0000 Subject: [PATCH 18/77] fix mac ci bug make forward declaration test=develop --- paddle/fluid/framework/var_type_traits.cc | 13 ++++++ paddle/fluid/framework/var_type_traits.h | 43 +++++++++++++++---- .../fluid/framework/var_type_traits_test.cc | 31 +++++++------ 3 files changed, 64 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 690c4895c1..c3c5bab23b 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -13,7 +13,20 @@ // limitations under the License. #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/macros.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index a58414c3d4..b51b4933e6 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -20,23 +20,48 @@ #include #include #include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA +#include #ifndef _WIN32 #include -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#include -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif +// Users should add forward declarations here +namespace paddle { + +namespace platform { +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +class Communicator; +#endif +#endif +} // namespace platform + +namespace framework { +class Tensor; +class LoDTensor; +class SelectedRows; +class LoDRankTable; +class ReaderHolder; +class Scope; +} // namespace framework + +namespace operators { +template +class AlgorithmsCache; + +class CudnnRNNCache; + +namespace reader { +class LoDTensorBlockingQueueHolder; +} // namespace reader +} // namespace operators + +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 4dad4cb27b..1c7d9f2abe 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -12,12 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
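
The cure above is the standard one for heavyweight headers: var_type_traits.h only needs to name the registered types — enough to form pointers and compute compile-time positions — so forward declarations suffice, while the full definitions move into var_type_traits.cc and this test. A minimal sketch of the split, collapsed into one file for illustration:

#include <cassert>

class BigType;               // header side: a declaration is enough...
int IdOf(const BigType*);    // ...to declare interfaces and form pointers.

class BigType {              // definition normally lives behind the .cc include
 public:
  int id{7};
};

int IdOf(const BigType* t) { return t ? t->id : -1; }

int main() {
  BigType b;
  assert(IdOf(&b) == 7);  // files including only the "header" part never see
  return 0;               // BigType's transitive includes (cudnn.h, nccl.h, ...)
}
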
-#include "paddle/fluid/framework/var_type_traits.h" #include #include #include #include +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + namespace paddle { namespace framework { @@ -32,19 +45,9 @@ struct TypeIndexChecker { constexpr auto kId = VarTypeTrait::kId; std::type_index actual_type(typeid(Type)); EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); - // For some reasons, comparing std::type_index using EXPECT_EQ would fail - // in MAC CI - bool is_same_type_index = (ToTypeIndex(kId) == actual_type); - if (!is_same_type_index) { - std::string s1 = ToTypeName(kId); - std::string s2 = actual_type.name(); - PADDLE_THROW("Step %d: type %s is not the same as %s, var_id %d", kPos, - s1.c_str(), s2.c_str(), kId); - } - EXPECT_TRUE(is_same_type_index); - EXPECT_TRUE(ToTypeId(actual_type) == kId); // NOLINT - is_same_type_index = (ToTypeIndex(ToTypeId(actual_type)) == actual_type); - EXPECT_TRUE(is_same_type_index); + EXPECT_EQ(ToTypeIndex(kId), actual_type); + EXPECT_EQ(ToTypeId(actual_type), kId); + EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT From 0a4b6fc0561c1b3f1b5610b2d161c837dc4b8a0e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 14:12:24 +0800 Subject: [PATCH 19/77] Remove unnessesary code test=develop --- CMakeLists.txt | 2 +- cmake/external/robin_map.cmake | 31 ------ paddle/fluid/framework/CMakeLists.txt | 2 +- .../framework/details/execution_strategy.h | 2 +- .../scope_buffered_ssa_graph_executor.cc | 11 +- paddle/fluid/framework/ir/graph.cc | 65 +++-------- paddle/fluid/framework/rw_lock.h | 101 ++++++++++++------ paddle/fluid/framework/scope.cc | 51 ++++----- paddle/fluid/framework/scope.h | 29 +---- paddle/fluid/framework/spin_lock.h | 71 ------------ .../fluid/operators/math/concat_and_split.cu | 4 +- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/fluid/profiler.py | 1 - 13 files changed, 117 insertions(+), 255 deletions(-) delete mode 100644 cmake/external/robin_map.cmake delete mode 100644 paddle/fluid/framework/spin_lock.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fda5d460d..c31f51a3f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,7 +294,7 @@ if(WITH_PSLIB) list(APPEND EXTERNAL_LIBS pslib_brpc) list(APPEND EXTERNAL_LIBS libmct) endif(WITH_PSLIB) - + if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/cmake/external/robin_map.cmake b/cmake/external/robin_map.cmake deleted file mode 100644 index ddaf59536c..0000000000 --- a/cmake/external/robin_map.cmake +++ /dev/null @@ -1,31 +0,0 @@ -include(ExternalProject) - -set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map) -set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include) - -include_directories(${ROBIN_MAP_INCLUDE_DIR}) - -ExternalProject_Add( - extern_robin_map - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Tessil/robin-map.git" - GIT_TAG "v0.5.0" - PREFIX ${ROBIN_MAP_SOURCE_DIR} - UPDATE_COMMAND "" - 
CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) - -if(${CMAKE_VERSION} VERSION_LESS "3.3.0") - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c) - file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") - add_library(robin_map STATIC ${dummyfile}) -else() - add_library(robin_map INTERFACE) -endif() - -add_dependencies(robin_map extern_robin_map) - -LIST(APPEND externl_project_dependencies robin_map) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 10a637af44..412bc9cbe8 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -83,7 +83,7 @@ cc_test(variable_test SRCS variable_test.cc) cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash) +cc_library(scope SRCS scope.cc DEPS glog threadpool) cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736..15c496130c 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,7 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{1}; + size_t num_iteration_per_drop_scope_{100}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index ea783c6090..57f6fc66c5 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -64,24 +64,21 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); - ++drop_scope_counter_; + drop_scope_counter_ += 1; - if (!fetch_tensors.empty()) { + if (!fetch_tensors.empty() || + drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + drop_scope_counter_ = 0; // Wait All computational streams for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - } - - if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { - drop_scope_counter_ = 0; for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } } - if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8e67f8f610..8670dcfed7 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -20,10 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" -DEFINE_bool(enforce_when_check_program, true, - "Checking whether the program is correct or not. 
We will log " - "errors rather than throwing exceptions if this flag turned off"); - namespace paddle { namespace framework { namespace ir { @@ -48,56 +44,27 @@ void CheckProgram(const ProgramDesc &program) { break; case _INT(OpRole::kBackward): case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - if (!FLAGS_enforce_when_check_program) { - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - } else { - if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator %s after optimize operator." - << op->Type(); - } - } + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add backward operator %s after optimize operator.", + op->Type()); break; case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - if (!FLAGS_enforce_when_check_program) { - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - } else { - if (visit.find(_INT(OpRole::kBackward) | _INT(OpRole::kLoss)) != - visit.end()) { - LOG(ERROR) << "Cannot add backward|loss operator before " - << "forward|loss operator %s." << op->Type(); - } - - if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { - LOG(ERROR) << "Cannot add forward|loss operator %s after optimize " - "operator." - << op->Type(); - } - } + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | + _INT(OpRole::kLoss)) == visit.end(), + "Cannot add backward|loss operator before " + "forward|loss operator %s.", + op->Type()); + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add forward|loss operator %s after optimize operator.", + op->Type()); break; case _INT(OpRole::kOptimize): case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - if (!FLAGS_enforce_when_check_program) { - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - } else { - if (visit.find(_INT(OpRole::kBackward)) == visit.end()) { - LOG(ERROR) << "Optimize operators %s must follow backward operator." - << op->Type(); - } - } + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), + "Optimize operators %s must follow backward operator.", + op->Type()); break; case _INT(OpRole::kLRSched): case _INT(OpRole::kDist): diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index 75e6bef9bf..dbf00f3a79 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -16,9 +16,7 @@ limitations under the License. 
*/ #if !defined(_WIN32) #include -#else -#include // NOLINT -#endif // !_WIN32 +#endif // !_WIN32 #include "paddle/fluid/platform/enforce.h" @@ -31,17 +29,17 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } - inline void RDLock() { + void RDLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); } - inline void WRLock() { + void WRLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); } - inline void UNLock() { + void UNLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); } @@ -53,44 +51,81 @@ struct RWLock { // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // In windows, rw_lock seems like a hack. Use empty object and do nothing. struct RWLock { - // FIXME(minqiyang): use mutex here to do fake lock - inline void RDLock() { mutex_.lock(); } - - inline void WRLock() { mutex_.lock(); } - - inline void UNLock() { mutex_.unlock(); } - - private: - std::mutex mutex_; + void RDLock() {} + void WRLock() {} + void UNLock() {} }; #endif -class AutoWRLock { +class RWLockGuard { public: - explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - - inline void Lock() { lock_->WRLock(); } - - inline void UnLock() { lock_->UNLock(); } - - ~AutoWRLock() { UnLock(); } - - private: - RWLock* lock_; -}; + enum Status { kUnLock, kWRLock, kRDLock }; + + RWLockGuard(RWLock* rw_lock, Status init_status) + : lock_(rw_lock), status_(Status::kUnLock) { + switch (init_status) { + case Status::kRDLock: { + RDLock(); + break; + } + case Status::kWRLock: { + WRLock(); + break; + } + case Status::kUnLock: { + break; + } + } + } -class AutoRDLock { - public: - explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + void WRLock() { + switch (status_) { + case Status::kUnLock: { + lock_->WRLock(); + status_ = Status::kWRLock; + break; + } + case Status::kWRLock: { + break; + } + case Status::kRDLock: { + PADDLE_THROW( + "Please unlock read lock first before invoking write lock."); + break; + } + } + } - inline void Lock() { lock_->RDLock(); } + void RDLock() { + switch (status_) { + case Status::kUnLock: { + lock_->RDLock(); + status_ = Status::kRDLock; + break; + } + case Status::kRDLock: { + break; + } + case Status::kWRLock: { + PADDLE_THROW( + "Please unlock write lock first before invoking read lock."); + break; + } + } + } - inline void UnLock() { lock_->UNLock(); } + void UnLock() { + if (status_ != Status::kUnLock) { + lock_->UNLock(); + status_ = Status::kUnLock; + } + } - ~AutoRDLock() { UnLock(); } + ~RWLockGuard() { UnLock(); } private: RWLock* lock_; + Status status_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 4f79d98260..6fa5e99f9f 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -47,15 +47,9 @@ DEFINE_bool(fast_eager_deletion_mode, false, // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. 
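The locking scheme this comment introduces is easier to see stripped down: one mutable mutex per scope, a macro that expands to an RAII guard, and an inference build that defines the macro away entirely. A minimal sketch under those assumptions (class, macro and flag names here are illustrative):

#include <mutex>

#ifdef SINGLE_THREADED_BUILD
#define NAME_TABLE_LOCK_GUARD  // no-op: nothing else can race with us
#else
#define NAME_TABLE_LOCK_GUARD std::lock_guard<std::mutex> guard(mutex_);
#endif

class NameTable {
 public:
  int Size() const {
    NAME_TABLE_LOCK_GUARD  // acquired here, released at end of scope
    return size_;
  }

 private:
  mutable std::mutex mutex_;  // mutable so const readers can still lock
  int size_ = 0;
};

Compared with the paired read/write locks being removed, a single std::mutex gives up reader concurrency but cannot deadlock on lock-ordering mistakes between two separate locks.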
#ifdef PADDLE_ON_INFERENCE -#define SCOPE_KIDS_READER_LOCK -#define SCOPE_KIDS_WRITER_LOCK -#define SCOPE_VARS_READER_LOCK -#define SCOPE_VARS_WRITER_LOCK +#define SCOPE_LOCK_GUARD #else -#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); +#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); #endif namespace paddle { @@ -73,69 +67,64 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - Scope* child = new Scope(this); - { - SCOPE_KIDS_WRITER_LOCK - kids_.push_back(child); - } - return *child; + SCOPE_LOCK_GUARD + kids_.push_back(new Scope(this)); + return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - SCOPE_VARS_WRITER_LOCK + SCOPE_LOCK_GUARD return VarInternal(name); } Variable* Scope::Var(std::string* name) { + SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; } - SCOPE_VARS_WRITER_LOCK return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_VARS_READER_LOCK + SCOPE_LOCK_GUARD return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_VARS_READER_LOCK + SCOPE_LOCK_GUARD return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_VARS_READER_LOCK + SCOPE_LOCK_GUARD return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_KIDS_WRITER_LOCK + SCOPE_LOCK_GUARD for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_KIDS_READER_LOCK + SCOPE_LOCK_GUARD auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { + SCOPE_LOCK_GUARD std::vector known_vars; - { - SCOPE_VARS_READER_LOCK - known_vars.reserve(this->vars_.size()); - for (auto& p : vars_) { - known_vars.emplace_back(p.first); - } + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); } return known_vars; } void Scope::DeleteScope(Scope* scope) const { - SCOPE_KIDS_WRITER_LOCK + SCOPE_LOCK_GUARD auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -149,8 +138,8 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { + SCOPE_LOCK_GUARD std::set var_set(var_names.begin(), var_names.end()); - SCOPE_VARS_WRITER_LOCK for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { it = vars_.erase(it); @@ -162,12 +151,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_VARS_WRITER_LOCK + SCOPE_LOCK_GUARD RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_VARS_WRITER_LOCK + SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 77ef18414d..aded1f771c 100644 --- a/paddle/fluid/framework/scope.h +++ 
b/paddle/fluid/framework/scope.h @@ -14,19 +14,12 @@ limitations under the License. */ #pragma once -extern "C" { -#include -} - -#include #include -#include +#include // NOLINT #include #include -#include #include -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -38,14 +31,6 @@ bool IsFastEagerDeletionModeEnabled(); class Scope; -namespace inner { -struct KeyHasher { - std::size_t operator()(const std::string& key) const { - return XXH32(key.c_str(), key.size(), 1); - } -}; -} // namespace inner - /** * @brief Scope that manage all variables. * @@ -110,14 +95,7 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map, - inner::KeyHasher> - vars_; - // mutable tsl::robin_map< - // std::string, std::unique_ptr, std::hash, - // std::equal_to, - // std::allocator>>, true> - // vars_; + mutable std::unordered_map> vars_; private: // Call Scope::NewScope for a sub-scope. @@ -146,8 +124,7 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable RWLock kids_lock_; - mutable RWLock vars_lock_; + mutable std::mutex mutex_; }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/spin_lock.h b/paddle/fluid/framework/spin_lock.h deleted file mode 100644 index 11a763d655..0000000000 --- a/paddle/fluid/framework/spin_lock.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#if !defined(_WIN32) -#include -#else -#include // NOLINT -#endif // !_WIN32 - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -#if !defined(_WIN32) -struct SpinLock { - SpinLock() { pthread_spin_init(&lock_, PTHREAD_PROCESS_PRIVATE); } - - ~SpinLock() { pthread_spin_destroy(&lock_); } - - void Lock() { - PADDLE_ENFORCE_EQ(pthread_spin_lock(&lock_), 0, "acquire spin lock failed"); - } - - void Unlock() { - PADDLE_ENFORCE_EQ(pthread_spin_unlock(&lock_), 0, - "release spin lock failed"); - } - - private: - pthread_spinlock_t lock_; -}; -#else -// FIXME(minqiyang): use mutex here to do fake spin lock -struct SpinLock { - void Lock() { mutex_.lock(); } - - void Unlock() { mutex_.lock(); } - - private: - std::mutex mutex_; -}; -#endif - -class AutoSpinLock { - public: - explicit SpinLockGuard(SpinLock* spin_lock) : lock_(spin_lock) { - lock_->Lock(); - } - - ~SpinLockGuard() { lock_->Unlock(); } - - private: - SpinLock* lock_; -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 930d851696..760a065c10 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -180,7 +180,7 @@ class ConcatFunctor { } // Wait() must be called because `inputs_data` may be destructed before // kernel ends - /* context.Wait(); */ + context.Wait(); } }; @@ -258,7 +258,7 @@ class SplitFunctor { } // Wait() must be called because `outputs_data` may be destructed before // kernel ends - /* context.Wait(); */ + context.Wait(); } }; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c108c82756..88a2a5276a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -822,7 +822,7 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is INT, num_iteration_per_drop_scope indicates how many iterations to clean up the temp variables which is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. Default 1. + because the temp variable's shape maybe the same between two iterations. Default 100. NOTES: 1. If you fetch data when calling the 'run', the ParallelExecutor diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 78f7a6ac08..e05885f5f5 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -92,7 +92,6 @@ def cuda_profiler(output_file, output_mode=None, config=None): config_file = 'nvprof_config_file' with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) - #Comment this for nvprof core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() From bc6640156600e88e20813a0539ff1cbc7dd9ac3a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 16:08:06 +0800 Subject: [PATCH 20/77] Polish code test=develop --- paddle/fluid/platform/enforce.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index d1dd09f206..78e8fbc51d 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -260,19 +260,19 @@ inline void throw_on_error(T e) { #define PADDLE_JUDGE -#define __PADDLE_UNARY_COMPARE(COND, ...) 
\ - do { \ - auto __cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ - } \ +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); /* NOLINT */ \ + } \ } while (0) #ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(COND, ...) \ do { \ try { \ - __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); /* NOLINT */ \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ From 41b81293ab708829459f2314c3c7ec0f14abf506 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 16:13:16 +0800 Subject: [PATCH 21/77] Polish code test=develop --- paddle/fluid/platform/enforce.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 78e8fbc51d..5fed6b804f 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -260,24 +260,24 @@ inline void throw_on_error(T e) { #define PADDLE_JUDGE -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto __cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); /* NOLINT */ \ - } \ - } while (0) +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ + } \ + } while (0) // NOLINT #ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(COND, ...) \ do { \ try { \ - __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); /* NOLINT */ \ + __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ } \ - } while (false) + } while (0) // NOLINT #else #define PADDLE_ENFORCE(COND, ...) __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); From 4af97c6946435e5129e94cf507fc30f798d09e9e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 17:07:03 +0800 Subject: [PATCH 22/77] Polish code --- paddle/fluid/platform/enforce.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 5fed6b804f..eee8173ba5 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -266,7 +266,7 @@ inline void throw_on_error(T e) { if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ } \ - } while (0) // NOLINT + } while (0) #ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(COND, ...) \ @@ -277,7 +277,7 @@ inline void throw_on_error(T e) { throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ } \ - } while (0) // NOLINT + } while (0) #else #define PADDLE_ENFORCE(COND, ...) 
__PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); From 22c71398e3b9a864d8c2d3a3a2e589d42db82098 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Fri, 21 Dec 2018 17:49:27 +0800 Subject: [PATCH 23/77] add MM_DNN inference test test=develop --- .../fluid/inference/tests/api/CMakeLists.txt | 5 + .../tests/api/analyzer_mm_dnn_tester.cc | 178 ++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 46ce61b736..95bbc74a59 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -75,6 +75,11 @@ set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc) +# MM DNN +set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn") +download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc) + # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc new file mode 100644 index 0000000000..858bc6d4ea --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +using contrib::AnalysisConfig; + +struct DataRecord { + std::vector> query_data_all, title_data_all; + std::vector lod1, lod2; + size_t batch_iter{0}; + size_t batch_size{1}; + size_t num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. 
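The lod1/lod2 vectors assembled in the next few lines are LoD (level-of-detail) offset arrays: they start at 0 and each entry adds the token count of one more sample, so sequence j occupies the half-open range [lod[j], lod[j+1]). A standalone sketch of that accumulation (an illustrative helper, not part of the test):

#include <cstddef>
#include <cstdint>
#include <vector>

// Per-sample token counts -> cumulative LoD offsets,
// e.g. lengths {3, 2, 4} become offsets {0, 3, 5, 9}.
std::vector<std::size_t> BuildLoD(
    const std::vector<std::vector<int64_t>>& samples) {
  std::vector<std::size_t> lod{0};
  for (const auto& sample : samples) {
    lod.push_back(lod.back() + sample.size());
  }
  return lod;
}

Each input keeps its own offsets, which is also why a later commit in this series has to fix lod2 to accumulate title lengths rather than query lengths.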
+ if (batch_end <= query_data_all.size()) { + data.query_data_all.assign(query_data_all.begin() + batch_iter, + query_data_all.begin() + batch_end); + data.title_data_all.assign(title_data_all.begin() + batch_iter, + title_data_all.begin() + batch_end); + // Prepare LoDs + data.lod1.push_back(0); + data.lod2.push_back(0); + CHECK(!data.query_data_all.empty()); + CHECK(!data.title_data_all.empty()); + CHECK_EQ(data.query_data_all.size(), data.title_data_all.size()); + for (size_t j = 0; j < data.query_data_all.size(); j++) { + // calculate lod + data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size()); + data.lod2.push_back(data.lod2.back() + data.query_data_all[j].size()); + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, '\t', &data); + // load query data + std::vector query_data; + split_to_int64(data[0], ' ', &query_data); + // load title data + std::vector title_data; + split_to_int64(data[1], ' ', &title_data); + query_data_all.push_back(std::move(query_data)); + title_data_all.push_back(std::move(title_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_query_tensor, lod_title_tensor; + lod_query_tensor.name = "left"; + lod_title_tensor.name = "right"; + auto one_batch = data->NextBatch(); + int size1 = one_batch.lod1[one_batch.lod1.size() - 1]; // token batch size + int size2 = one_batch.lod2[one_batch.lod2.size() - 1]; // token batch size + lod_query_tensor.shape.assign({size1, 1}); + lod_query_tensor.lod.assign({one_batch.lod1}); + lod_title_tensor.shape.assign({size2, 1}); + lod_title_tensor.lod.assign({one_batch.lod2}); + // assign data + TensorAssignData(&lod_query_tensor, one_batch.query_data_all); + TensorAssignData(&lod_title_tensor, one_batch.title_data_all); + // Set inputs. + input_slots->assign({lod_query_tensor, lod_title_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +void SetConfig(contrib::AnalysisConfig *cfg) { + cfg->model_dir = FLAGS_infer_model; + cfg->use_gpu = false; + cfg->device = 0; + cfg->specify_input_name = true; + cfg->enable_ir_optim = true; +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_MM_DNN, profile) { + contrib::AnalysisConfig cfg; + SetConfig(&cfg); + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + PADDLE_ENFORCE_EQ(outputs.size(), 2UL); + for (auto &output : outputs) { + size_t size = GetSize(output); + PADDLE_ENFORCE_GT(size, 0); + float *result = static_cast(output.data.data()); + // output is probability, which is in (-1, 1). 
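// (Strictly speaking a probability would lie in [0, 1]; the looser
// (-1, 1) bound checked below treats the output as a generic score.)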
+ for (size_t i = 0; i < size; i++) { + EXPECT_GT(result[i], -1); + EXPECT_LT(result[i], 1); + } + } + } +} + +// Check the fuse status +TEST(Analyzer_MM_DNN, fuse_statis) { + contrib::AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_MM_DNN, compare) { + contrib::AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare Deterministic result +TEST(Analyzer_MM_DNN, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +} // namespace inference +} // namespace paddle From 099186cd41f8aba32ef8f70afd507ee344f3e75c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 20:01:59 +0800 Subject: [PATCH 24/77] Support one argument PADDLE_ENFORCE test=develop --- paddle/fluid/platform/enforce.h | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index eee8173ba5..ec4d0bf910 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -258,21 +258,33 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) -#define PADDLE_JUDGE - -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto __cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ - } \ +#define PADDLE_THROW_ERROR(COND, ...) \ + PADDLE_THROW_I(__VA_ARGS__, \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND)) + +#define PADDLE_THROW_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + PADDLE_THROW_ERROR(COND, __VA_ARGS__); \ + } \ } while (0) #ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(COND, ...) \ do { \ try { \ - __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ @@ -280,7 +292,7 @@ inline void throw_on_error(T e) { } while (0) #else -#define PADDLE_ENFORCE(COND, ...) __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); +#define PADDLE_ENFORCE(COND, ...) 
__PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG #define PADDLE_THROW_EOF() \ From 5a5c577529bdfe60f584bd490f3dedc6aa991fa6 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 20:03:12 +0800 Subject: [PATCH 25/77] Polish code test=develop --- paddle/fluid/platform/enforce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index ec4d0bf910..efead29303 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -276,7 +276,7 @@ inline void throw_on_error(T e) { do { \ auto __cond = COND; \ if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - PADDLE_THROW_ERROR(COND, __VA_ARGS__); \ + PADDLE_THROW_ERROR(__cond, __VA_ARGS__); \ } \ } while (0) From e4719eb4625e695fc1fcc786444c1a9c8d78fc57 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 20:42:29 +0800 Subject: [PATCH 26/77] Fix bug in Windows VC 2010 test=develop --- paddle/fluid/operators/lrn_mkldnn_op.cc | 2 +- paddle/fluid/platform/enforce.h | 35 ++++++++++++++----------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 0a18882e81..adcf694454 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,7 +50,7 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, + PADDLE_ENFORCE(std::is_same::value, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index efead29303..dd83686b9d 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -258,30 +258,30 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) -#define PADDLE_THROW_ERROR(COND, ...) \ - PADDLE_THROW_I(__VA_ARGS__, \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND)) - -#define PADDLE_THROW_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; +#define __PADDLE_THROW_ERROR(COND, ...) \ + __PADDLE_THROW_ERROR_I( \ + __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND)) + +#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; #define __PADDLE_UNARY_COMPARE(COND, ...) 
\ do { \ auto __cond = COND; \ if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - PADDLE_THROW_ERROR(__cond, __VA_ARGS__); \ + __PADDLE_THROW_ERROR(__cond, __VA_ARGS__); \ } \ } while (0) #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(COND, ...) \ +#define __PADDLE_ENFORCE_I(COND, ...) \ do { \ try { \ __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ @@ -292,9 +292,12 @@ inline void throw_on_error(T e) { } while (0) #else -#define PADDLE_ENFORCE(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); +#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG +#define __PADDLE_ENFORCE(args) __PADDLE_ENFORCE_I args +#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__)) + #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ From 0cf1461ccc17672aa93acb32883c56830f0dfa29 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 12:44:11 +0800 Subject: [PATCH 27/77] Avoid comma in macro test=develop --- paddle/fluid/operators/lrn_mkldnn_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index adcf694454..c96dd63516 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,8 +50,8 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); From e811e06555d0a458fb885a4956bb5128d1bc37b6 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 12:48:52 +0800 Subject: [PATCH 28/77] Avoid comma in macro test=develop --- paddle/fluid/operators/lrn_mkldnn_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index c96dd63516..4e4f977fcc 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,7 +50,7 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - bool is_float_type = std::is_same::value; + const bool is_float_type = std::is_same::value; PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); @@ -132,8 +132,8 @@ template class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); PADDLE_ENFORCE( From 7d1533216dd6776ce17a857b082c25d5d5cccf49 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 13:02:13 +0800 Subject: [PATCH 29/77] Fix syntax error in unit test test=develop --- paddle/fluid/inference/analysis/analyzer_tester.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index cb88333d15..1fc5a00858 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -69,9 +69,9 @@ void TestWord2vecPrediction(const std::string& model_path) { std::vector outputs; CHECK(predictor->Run(slots, &outputs)); - PADDLE_ENFORCE(outputs.size(), 1UL); + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); // Check the output buffer size and result of each tid. - PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL); float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); @@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) { i++) { LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i]; - PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], - result[i]); + PADDLE_ENFORCE_EQ(static_cast(outputs.front().data.data())[i], + result[i]); } } From b1d0a14c144c71f0f912d1e8ec0d0b4170546c12 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 13:06:11 +0800 Subject: [PATCH 30/77] Change the ut back test=develop --- paddle/fluid/inference/analysis/analyzer_tester.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 1fc5a00858..f84e1ab6b8 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -78,10 +78,10 @@ void TestWord2vecPrediction(const std::string& model_path) { // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(static_cast(5UL), num_elements); i++) { - LOG(INFO) << "data: " - << static_cast(outputs.front().data.data())[i]; - PADDLE_ENFORCE_EQ(static_cast(outputs.front().data.data())[i], - result[i]); + LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i] + << " result: " << result[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + result[i]); } } From 45acfbd0118ffaa2661148904667235e3c9b134b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 17:56:04 +0800 Subject: [PATCH 31/77] 1. Add specific condition for one or no arg in PADDLE_ENFORCE 2. Add unit test for new enforce feature test=develop --- paddle/fluid/platform/enforce.h | 13 ++++++++----- paddle/fluid/platform/enforce_test.cc | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index dd83686b9d..7eb4be2137 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -258,7 +258,12 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) -#define __PADDLE_THROW_ERROR(COND, ...) \ +#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; + +#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ + ::paddle::platform::throw_on_error(COND, "%s", std::string(ARG)); + +#define __PADDLE_THROW_ON_ERROR(COND, ...) 
\ __PADDLE_THROW_ERROR_I( \ __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ @@ -268,15 +273,13 @@ inline void throw_on_error(T e) { ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND)) - -#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; + __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)) #define __PADDLE_UNARY_COMPARE(COND, ...) \ do { \ auto __cond = COND; \ if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - __PADDLE_THROW_ERROR(__cond, __VA_ARGS__); \ + __PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \ } \ } while (0) diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index d521829655..1091badae5 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -37,6 +37,25 @@ TEST(ENFORCE, FAILED) { HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); } EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok at all"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok at all")); + } + EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_NE(std::string(error.what()).find(" at "), 0); + } + EXPECT_TRUE(caught_exception); } TEST(ENFORCE, NO_ARG_OK) { From 010f657b336944556d190d9054c328a7dc6e87c9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 18:31:54 +0800 Subject: [PATCH 32/77] Polish code test=develop --- paddle/fluid/operators/detail/safe_ref.h | 2 +- paddle/fluid/platform/enforce.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h index a800d5df0a..8660bc219c 100644 --- a/paddle/fluid/operators/detail/safe_ref.h +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -25,7 +25,7 @@ namespace detail { */ template inline T& Ref(T* ptr, ARGS&&... args) { - PADDLE_ENFORCE(ptr != nullptr, args...); + PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...)); return *ptr; } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 7eb4be2137..e9b98aee1f 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -298,7 +298,7 @@ inline void throw_on_error(T e) { #define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG -#define __PADDLE_ENFORCE(args) __PADDLE_ENFORCE_I args +#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args #define PADDLE_ENFORCE(...) 
__PADDLE_ENFORCE((__VA_ARGS__)) #define PADDLE_THROW_EOF() \ From 52b4821a6eab9fc496de2e132ef0744c1e573ca4 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 19:24:02 +0800 Subject: [PATCH 33/77] Fix Sprintf problem test=develop --- paddle/fluid/platform/enforce.h | 2 +- paddle/fluid/string/printf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index e9b98aee1f..0668053950 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -261,7 +261,7 @@ inline void throw_on_error(T e) { #define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; #define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ - ::paddle::platform::throw_on_error(COND, "%s", std::string(ARG)); + ::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG)); #define __PADDLE_THROW_ON_ERROR(COND, ...) \ __PADDLE_THROW_ERROR_I( \ diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index a2eec6e3c4..0b94b60018 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -87,7 +87,7 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { template std::string Sprintf(const Args&... args) { std::ostringstream oss; - Fprintf(oss, ""); + Fprintf(oss, "%s", args...); return oss.str(); } From f8fc6ba5954ed44706319ecde5fc8752221412ed Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Mon, 24 Dec 2018 11:42:21 +0000 Subject: [PATCH 34/77] test=develop, fix ci by install requirement and add pip install validation --- paddle/scripts/installation_validate.py | 18 ++++++++++++++++++ paddle/scripts/paddle_build.sh | 3 +++ 2 files changed, 21 insertions(+) create mode 100644 paddle/scripts/installation_validate.py diff --git a/paddle/scripts/installation_validate.py b/paddle/scripts/installation_validate.py new file mode 100644 index 0000000000..f84e2f4b17 --- /dev/null +++ b/paddle/scripts/installation_validate.py @@ -0,0 +1,18 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
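Two preprocessor subtleties run through the enforce patches above. First, an unparenthesized comma inside a macro argument — as in std::is_same<T, float>::value — splits it into two arguments, which is why such expressions get hoisted into a named bool before being passed to PADDLE_ENFORCE. Second, __PADDLE_THROW_ERROR_I is the classic pick-the-Nth-argument trick for dispatching on arity. A minimal sketch of that trick, with illustrative macro names:

#include <cstdio>

// Expands to its 10th argument. Padding a call with one candidate
// expansion per possible arity makes whichever expansion lands in
// slot X the one matching the real number of trailing arguments.
#define PICK_10TH(_1, _2, _3, _4, _5, _6, _7, _8, _9, X, ...) X

// One trailing argument: print it as a plain string.
// Two to nine: treat the first as a printf-style format string.
#define LOG_IF_FALSE(COND, ...)                                              \
  do {                                                                       \
    if (!(COND)) {                                                           \
      PICK_10TH(__VA_ARGS__, std::printf(__VA_ARGS__),                       \
                std::printf(__VA_ARGS__), std::printf(__VA_ARGS__),          \
                std::printf(__VA_ARGS__), std::printf(__VA_ARGS__),          \
                std::printf(__VA_ARGS__), std::printf(__VA_ARGS__),          \
                std::printf(__VA_ARGS__), std::printf("%s\n", __VA_ARGS__)); \
    }                                                                        \
  } while (0)

// LOG_IF_FALSE(n > 0, "n must be positive");             // one argument
// LOG_IF_FALSE(n > 0, "n must be positive, got %d", n);  // format + value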
+ +import paddle.fluid as fluid +import paddle as pd + +print(pd.__version__) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2e6b40148d..99a661f464 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -79,6 +79,7 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib" + pip install -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -441,7 +442,9 @@ EOF # make install should also be test when unittest make install -j 8 if [ "$1" == "cp27-cp27m" ]; then + set -e pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + python -c installation_validate.py elif [ "$1" == "cp35-cp35m" ]; then pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl elif [ "$1" == "cp36-cp36m" ]; then From e9c86ac41d655eae27df5728da8250773803b660 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Mon, 24 Dec 2018 11:51:06 +0000 Subject: [PATCH 35/77] test=develop, install requirements.txt with user previlige --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 99a661f464..6ccbb0c37f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -79,7 +79,7 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib" - pip install -r ${PADDLE_ROOT}/python/requirements.txt + pip install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi From ea6e057e40ae7451302a20b37bdfbc8f485b9483 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Mon, 24 Dec 2018 13:21:06 +0000 Subject: [PATCH 36/77] test=develop, fix bug --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 6ccbb0c37f..c44d1be75c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -444,7 +444,7 @@ EOF if [ "$1" == "cp27-cp27m" ]; then set -e pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - python -c installation_validate.py + python installation_validate.py elif [ "$1" == "cp35-cp35m" ]; then pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl elif [ "$1" == "cp36-cp36m" ]; then From 91408e3122de7d578ff69a3fa401e34297c21248 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Tue, 25 Dec 2018 11:13:28 +0800 Subject: [PATCH 37/77] fix analyzer_mm_dnn_tester fails when bs > 1 test=develop --- paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 858bc6d4ea..8aaab6d664 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -47,7 +47,7 @@ struct DataRecord { for (size_t j = 0; j < 
data.query_data_all.size(); j++) { // calculate lod data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size()); - data.lod2.push_back(data.lod2.back() + data.query_data_all[j].size()); + data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size()); } } batch_iter += batch_size; From 3e40c79c4f70cd0600be32b6101bb267177646ab Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 25 Dec 2018 05:09:21 +0000 Subject: [PATCH 38/77] test=develop, using absolute dir --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index c44d1be75c..25c945c8ce 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -444,7 +444,7 @@ EOF if [ "$1" == "cp27-cp27m" ]; then set -e pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - python installation_validate.py + python ${PADDLE_ROOT}/paddle/scripts/installation_validate.py elif [ "$1" == "cp35-cp35m" ]; then pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl elif [ "$1" == "cp36-cp36m" ]; then From 3a2afbf02e9bcc3d0a690564b8ea811b6cb10685 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 25 Dec 2018 04:24:44 +0000 Subject: [PATCH 39/77] polish code test=develop --- paddle/fluid/framework/operator.h | 12 ------------ paddle/fluid/framework/var_type.h | 10 +++++----- .../fluid/framework/var_type_inference_test.cc | 2 +- paddle/fluid/framework/var_type_traits.h | 6 +++--- paddle/fluid/framework/var_type_traits_test.cc | 17 +++++++++++++++++ paddle/fluid/framework/variable.h | 10 ++++++---- paddle/fluid/operators/affine_grid_op.cc | 4 ++-- paddle/fluid/operators/conv_op.cc | 4 ++-- paddle/fluid/operators/grid_sampler_op.cc | 4 ++-- paddle/fluid/operators/pool_op.cc | 4 ++-- paddle/fluid/operators/softmax_op.cc | 4 ++-- paddle/fluid/operators/warpctc_op.cc | 2 +- paddle/fluid/platform/cudnn_helper.h | 13 +++++++++++++ 13 files changed, 56 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 4492470e2a..39190d07b4 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -310,18 +310,6 @@ class ExecutionContext { const RuntimeContext& ctx_; }; -inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); -#ifdef PADDLE_WITH_CUDA - if (use_cudnn) { - auto& dev_ctx = ctx.device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; - } -#endif - return use_cudnn; -} - template <> const Tensor* ExecutionContext::Input(const std::string& name) const; diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index f1cbaf3fdc..73be446f71 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -46,19 +46,19 @@ inline proto::VarType::Type ToVarType(int type) { template inline void VisitVarType(const framework::Variable& var, Visitor visitor) { switch (var.Type()) { - case proto::VarType_Type_LOD_TENSOR: + case proto::VarType::LOD_TENSOR: visitor(var.Get()); return; - case proto::VarType_Type_LOD_RANK_TABLE: + case proto::VarType::LOD_RANK_TABLE: visitor(var.Get()); return; - case proto::VarType_Type_LOD_TENSOR_ARRAY: + case proto::VarType::LOD_TENSOR_ARRAY: visitor(var.Get()); return; - case proto::VarType_Type_SELECTED_ROWS: + case proto::VarType::SELECTED_ROWS: visitor(var.Get()); 
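// Both spellings coexist in protobuf's generated C++: an enum nested in
// message VarType is emitted as an outer-scope type VarType_Type with
// constants such as VarType_Type_LOD_TENSOR, plus in-class aliases such
// as VarType::LOD_TENSOR carrying identical values, so the rename here
// is purely cosmetic. The ASSERT_EQ lines added to the test further on
// verify exactly that equivalence.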
return; - case proto::VarType_Type_READER: + case proto::VarType::READER: visitor(var.Get()); return; default: diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 7842168f60..2a75394fca 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) { op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, + ASSERT_EQ(proto::VarType::LOD_TENSOR, prog.MutableBlock(0)->Var("test2_out")->GetType()); } diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index b51b4933e6..1b535219c1 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -136,8 +136,6 @@ struct VarTypeRegistryImpl { // Users should add other variable types below. // Paddle would generate unique Ids for each registered variable types. -class Scope; - using VarTypeRegistry = detail::VarTypeRegistryImpl< Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, @@ -171,6 +169,8 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); +REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); /** End of variable type registration */ diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 1c7d9f2abe..00840d634d 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -88,6 +88,23 @@ TEST(var_type_traits, check_proto_type_id) { ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); ASSERT_TRUE(CheckVarId(proto::VarType::PLACE_LIST)); ASSERT_TRUE(CheckVarId(proto::VarType::READER)); + ASSERT_TRUE(CheckVarId(proto::VarType::INT32)); + ASSERT_TRUE(CheckVarId(proto::VarType::FP32)); + + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR); + ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS); + ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES); + ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE); + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY, + proto::VarType::LOD_TENSOR_ARRAY); + ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST); + ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER); + ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH); + ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST); + ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW); + ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE); + ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32); + ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32); } TEST(var_type_traits, test_registry) { diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 8aa68942ad..b9d07da822 100644 --- 
a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -67,7 +67,6 @@ class Variable { private: struct Placeholder { - explicit Placeholder(int type) : type_(type) {} virtual ~Placeholder() = default; inline int Type() const { return type_; } @@ -75,6 +74,11 @@ class Variable { inline void* Ptr() { return ptr_; } protected: + inline void Init(void* p, int type) { + ptr_ = p; + type_ = type; + } + void* ptr_; int type_; }; @@ -86,9 +90,7 @@ class Variable { static_assert( IsRegisteredVarType(), "Not registered type. Please register T inside var_type_traits.h"); - PlaceholderImpl() : Placeholder(VarTypeTrait::kId) { - this->ptr_ = &obj_; - } + PlaceholderImpl() { this->Init(&obj_, VarTypeTrait::kId); } private: T obj_; diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 0c04873852..1de59a5165 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -74,7 +74,7 @@ class AffineGridOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; } #endif @@ -184,7 +184,7 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index c76bde99f4..8e0d282495 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -84,7 +84,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( framework::DataLayout layout = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; } #endif @@ -369,7 +369,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index be53a62cc9..14a2524bd8 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -59,7 +59,7 @@ class GridSampleOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -155,7 +155,7 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 
6781cdf9f3..5399ae556e 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -92,7 +92,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -122,7 +122,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index ad37967f0a..bc889a5a04 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -50,7 +50,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -157,7 +157,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index add03bad13..e2ae7caae1 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -51,7 +51,7 @@ class WarpCTCOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 74b0942379..61a25064d1 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -450,6 +451,18 @@ class ScopedActivationDescriptor { DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); }; +inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (use_cudnn) { + auto& dev_ctx = ctx.device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + return use_cudnn; +} + #if CUDNN_VERSION >= 7001 class ScopedCTCLossDescriptor { public: From 7f6d8acecb0c1d61dad645c581cd8cef9d554841 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 25 Dec 2018 14:12:01 +0800 Subject: [PATCH 40/77] cherry-pick the #12759 test=develop --- paddle/fluid/framework/op_proto_maker.cc | 4 ++ paddle/fluid/framework/op_proto_maker.h | 1 + paddle/fluid/framework/operator.cc | 71 ++++++++++++++----- paddle/fluid/operators/top_k_op.cc | 2 + paddle/fluid/pybind/const_value.cc | 3 + python/paddle/fluid/framework.py | 5 ++ .../tests/unittests/test_operator_desc.py | 2 +- 7 files changed, 70 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index ca31303f77..2311614c33 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -82,6 +82,10 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, AddAttr(OpNamescopeAttrName(), "Operator name with namescope.") .SetDefault(""); + AddAttr>(OpCreationCallstackAttrName(), + "Callstack for Op Creation.") + .SetDefault({}); + Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 4c59c73d87..0a0f8f4655 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -47,6 +47,7 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } + static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index fec311e3ee..4527e66191 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,10 +16,15 @@ limitations under the License.
*/ #include #include - +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" @@ -157,27 +162,59 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - VLOG(4) << place << " " << DebugStringEx(&scope); - if (platform::is_gpu_place(place)) { + try { + if (VLOG_IS_ON(4)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + } + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif - } + } - // The profile has a process-wide mutex, results in serious performance issue - // in concurrency scenario. Here use an `if` to fix this issue. - // Please do not remove the `if`, ask @Superjomn if there are any concerns. - if (platform::IsProfileEnabled()) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - } else { - RunImpl(scope, place); + // The profile has a process-wide mutex, results in serious performance + // issue + // in concurrency scenario. Here use an `if` to fix this issue. + // Please do not remove the `if`, ask @Superjomn if there are any concerns. + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); + } + + if (VLOG_IS_ON(3)) { + VLOG(3) << place << " " << DebugStringEx(&scope); + } + } catch (platform::EnforceNotMet exception) { + if (Attrs().count("sub_block") != 0) { + throw exception; + } + + auto& callstack = Attr>( + OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + + if (callstack.empty()) { + throw exception; + } + std::ostringstream sout; + sout << "Invoke operator " << Type() << " error.\n"; + sout << "Python Callstacks: \n"; + for (auto& line : callstack) { + sout << line; + } + sout << "C++ Callstacks: \n"; + sout << exception.err_str_; + exception.err_str_ = sout.str(); + throw exception; + } catch (...)
{ + std::rethrow_exception(std::current_exception()); } - VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index c17d1afc30..e634f7c9a5 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -30,6 +30,8 @@ class TopkOp : public framework::OperatorWithKernel { "Output(Indices) of TopkOp should not be null."); auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(input_dims.size(), 2, + "Rank of TopK op's input must be 2."); const int k = static_cast(ctx->Attrs().Get("k")); PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 06d8b65fb1..f8ded9f94e 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -49,6 +49,9 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); + op_proto_and_checker_maker.def( + "kOpCreationCallstackAttrName", + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index de30ed2fc5..3427fb0c4a 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -20,6 +20,7 @@ import os import re import six import sys +import traceback import numpy as np @@ -604,6 +605,10 @@ class Operator(object): if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] + callstack_var_name = op_maker.kOpCreationCallstackAttrName() + op_attrs[callstack_var_name] = list( + reversed(traceback.format_stack()))[1:] + if len(self.desc.type()) != 0: return if type is None: diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 4153394c1d..37b9a9188a 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope" + "op_namescope", "op_callstack" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) From e05fb128bc7045a0bd27f15e34129c7a5b7e6b53 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 25 Dec 2018 14:30:54 +0800 Subject: [PATCH 41/77] fix code style test=develop --- paddle/fluid/operators/top_k_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index e634f7c9a5..c80eaefaae 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -31,7 +31,7 @@ class TopkOp : public framework::OperatorWithKernel { auto input_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(input_dims.size(), 2, - "Rank of TopK op's input must be 2."); + "Rank of TopK op's input must be 2."); const int k = static_cast(ctx->Attrs().Get("k")); PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); From 8ec3d863b0eb932cf6921f1e860537baa4d1028f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 25 Dec 2018 15:50:24 +0800 Subject: [PATCH 42/77] Fix throw_on_error direct call bug test=develop --- 
paddle/fluid/operators/distributed/proto_encoder_helper.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index d2b0eb6ca6..27ca1f4edc 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -84,7 +84,9 @@ class ProtoEncodeHelper { ~ProtoEncodeHelper() { #define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised - paddle::platform::throw_on_error(p_ <= limit_); + if (paddle::platform::is_error(p_ <= limit_)) { + paddle::platform::throw_on_error(p_ <= limit_); + } #undef REPLACE_ENFORCE_GLOG } From fa33eae9aaf830d4bf85b1ae5a6873546de660fd Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 25 Dec 2018 09:10:44 +0000 Subject: [PATCH 43/77] test=develop, fix python extension on python3.x --- paddle/scripts/paddle_build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 25c945c8ce..418dc13468 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -92,6 +92,7 @@ function cmake_gen() { -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -104,6 +105,7 @@ function cmake_gen() { -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi @@ -116,6 +118,7 @@ function cmake_gen() { -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 fi From 170e78b397da7f4b64024c4c6b59758b07791ec6 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 25 Dec 2018 17:30:56 +0800 Subject: [PATCH 44/77] restore the top-k test=develop --- paddle/fluid/operators/top_k_op.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index c80eaefaae..c17d1afc30 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -30,8 +30,6 @@ class TopkOp : public framework::OperatorWithKernel { "Output(Indices) of TopkOp should not be null."); auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(input_dims.size(), 2, - "Rank of TopK op's input must be 2."); const int k = static_cast(ctx->Attrs().Get("k")); PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); From ce3782c193947fc3241528d3ede2e5e22f4dacd9 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 25 Dec 2018 11:10:46 +0000 Subject: [PATCH 45/77] add affine_channel fuse. fix conv+elementwise fuse bug.
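The fuse is valid because affine_channel applies y[c] = scale[c] * x[c] + bias[c] per output channel, which can be folded into the preceding conv by scaling each output-channel slice of the filter and absorbing the bias into an elementwise add. A minimal, self-contained C++ sketch of that folding arithmetic (illustrative names only, not the pass code; compare recompute_bias_and_weights in the diff below):

#include <cstddef>
#include <iostream>
#include <vector>

// Folds y = scale[c] * conv(x)[c] + bias[c] into the conv parameters:
// W'[c] = scale[c] * W[c] and b'[c] = scale[c] * b[c] + bias[c].
void FoldAffineChannel(std::vector<float>* weights,       // [c_out * k]
                       std::vector<float>* conv_bias,     // [c_out]
                       const std::vector<float>& scale,   // [c_out]
                       const std::vector<float>& bias) {  // [c_out]
  const std::size_t c_out = scale.size();
  const std::size_t k = weights->size() / c_out;  // weights per output channel
  for (std::size_t c = 0; c < c_out; ++c) {
    for (std::size_t i = 0; i < k; ++i) {
      (*weights)[c * k + i] *= scale[c];  // scale the c-th filter slice
    }
    // The folded bias absorbs both the old bias and the affine_channel bias.
    (*conv_bias)[c] = (*conv_bias)[c] * scale[c] + bias[c];
  }
}

int main() {
  std::vector<float> w = {1.f, 2.f, 3.f, 4.f};  // 2 output channels, k = 2
  std::vector<float> b = {0.f, 0.f};  // starts at zero, as in the pass below
  FoldAffineChannel(&w, &b, {2.f, 0.5f}, {1.f, -1.f});
  for (float v : w) std::cout << v << ' ';           // prints: 2 4 1.5 2
  std::cout << "| " << b[0] << ' ' << b[1] << '\n';  // prints: | 1 -1
  return 0;
}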
--- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/conv_affine_channel_fuse_pass.cc | 222 ++++++++++++++++++ .../ir/conv_affine_channel_fuse_pass.h | 49 ++++ .../framework/ir/graph_pattern_detector.cc | 76 ++++++ .../framework/ir/graph_pattern_detector.h | 32 +++ paddle/fluid/inference/api/analysis_config.cc | 2 +- .../fluid/inference/api/paddle_pass_builder.h | 4 +- paddle/fluid/operators/conv_fusion_op.cu.cc | 4 +- 8 files changed, 385 insertions(+), 5 deletions(-) create mode 100644 paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b7f7e2ee8e..6d795e1e2d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -45,6 +45,7 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) +pass_library(conv_affine_channel_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc new file mode 100644 index 0000000000..a7bfb8cf1e --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* Affine Channel inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ + /* Affine channel outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ + +void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, + const ir::Node& ac_scale, + const LoDTensor& ac_bias_tensor, + LoDTensor* eltwise_y_in_tensor) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from AffineChannel + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims()); + + auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), + ac_bias_tensor.numel(), 1); + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; + + // Re-compute weight of conv2d from AffineChannel + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= scale_array; +} + +std::unique_ptr ConvAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvAffineChannel fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+affinechannel fuse"; + return; + } + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + eltwise_y_in_desc.SetPersistable(true); + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + 
scope->Var(eltwise_y_in_node->Name())->GetMutable(); + + // Get affine_channel bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + // Initialize eltwise_y + eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({ac_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. + + GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, ac_out); + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_ac_count); + return graph; +} + +std::unique_ptr ConvEltwiseAddAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, true /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + // OPERATORS + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); + // BIAS inputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); + // BIAS outputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); + + // Get eltwise_y (conv bias) variable + auto* eltwise_y_in_tensor = + scope->FindVar(eltwise_y_in->Name())->GetMutable(); + + // Get batch norm bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // Update the elementwise_add node + eltwise->Op()->SetAttr("axis", 1); + eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); + + GraphSafeRemoveNodes(graph.get(), + {ac_scale, ac_bias, affine_channel, eltwise_out}); + + IR_NODE_LINK_TO(eltwise, ac_out); + + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + AddStatis(found_conv_ac_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_affine_channel_fuse_pass, + paddle::framework::ir::ConvAffineChannelFusePass); +REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, + paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h new file mode 100644 index 0000000000..ad966e11e6 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and ConvAffineChannel. + */ +class ConvAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_affine_channel_fuse"}; +}; + +class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvEltwiseAddAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 13d752e516..6ef3417901 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1101,9 +1101,13 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } +// only support "identity" and "relu" now. 
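+// (the fused cuDNN conv + bias + activation path only handles these two +// modes; see the conv_fusion_op.cu.cc change later in this patch)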
+/* std::unordered_set conv_act_set({"identity", "sigmoid", "relu", "relu6", "relux", "tanh", "band_pass"}); +*/ +std::unordered_set conv_act_set({"identity", "relu"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); @@ -1236,6 +1240,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { return elementwise_add_out; } +PDNode *patterns::ConvAffineChannel::operator()( + paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + + PDNode *eltwise_op = nullptr; + if (with_eltwise_add) { + eltwise_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + } + + auto *affine_channel_op = + pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel"); + // Create variables + // Conv Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d"); + + PDNode *eltwise_y_in_var = nullptr; + PDNode *eltwise_out_var = nullptr; + if (with_eltwise_add) { + // Conv output as Bias input + conv_out_var->assert_is_op_input("elementwise_add", "X"); + // Bias + eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add"); + } else { + // Conv output as AffineChannel input + conv_out_var->assert_is_op_input("affine_channel", "X"); + } + + // AC Scale + auto *ac_scale_var = pattern->NewNode(ac_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Scale"); + // AC Bias + auto *ac_bias_var = pattern->NewNode(ac_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Bias"); + + // AC output + auto *ac_out_var = pattern->NewNode(ac_out_repr()) + ->AsOutput() + ->assert_is_op_output("affine_channel"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + + if (with_eltwise_add) { + eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var}) + .LinksTo({eltwise_out_var}); + affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } else { + affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } + return ac_out_var; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index eaedd9d08e..61a5300344 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_out); }; +// Conv with affine_channel +// op: conv + (elementwise_add +) affine_channel +// named nodes: +// conv_weight, conv_out, conv, +// ac_x, ac_scale, ac_bias +// affine_channel, ac_out +struct ConvAffineChannel : public PatternBase { + ConvAffineChannel(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_affine_channel") {} + + PDNode* operator()(PDNode* conv_input, bool 
with_eltwise_add); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(affine_channel); + PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD + // CONV inputs + PATTERN_DECL_NODE(conv_weight); // Filter + // CONV outputs + PATTERN_DECL_NODE(conv_out); // tmp + // ELTWISE inputs + PATTERN_DECL_NODE(eltwise_y_in); + // ELTWISE outputs + PATTERN_DECL_NODE(eltwise_out); // tmp + + // AC(Affine_Channel) inputs + PATTERN_DECL_NODE(ac_scale); + PATTERN_DECL_NODE(ac_bias); + // AC outputs + PATTERN_DECL_NODE(ac_out); // Out +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index dcefdd92f5..8a0ddfbab4 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -110,7 +110,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; // Append after the infer_clean pass. - pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 40ca0d287c..d327f2bcec 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -118,7 +118,9 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", + "conv_eltwiseadd_affine_channel_fuse_pass", "conv_bn_fuse_pass", // "conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", // diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 3235ad52b9..d63e0fa030 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -161,9 +161,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - if ((activation == "identity") && - (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) && - (!residual)) { + if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. 
// But test in some case, the speed is slower, change to use From d4931a2abc6648bd652e0444972e41735f45dcf0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 25 Dec 2018 11:36:26 +0000 Subject: [PATCH 46/77] support more input fake data --- .../fluid/inference/tests/api/tester_helper.h | 47 +++++++++++-------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index b0c8f395ce..ef7e2198c5 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -132,7 +132,8 @@ std::unordered_map GetFuseStatis(PaddlePredictor *predictor, void SetFakeImageInput(std::vector> *inputs, const std::string &dirname, bool is_combined = true, std::string model_filename = "model", - std::string params_filename = "params") { + std::string params_filename = "params", + const std::vector *feed_names = nullptr) { // Set fake_image_data PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); std::vector> feed_target_shapes = GetFeedTargetShapes( @@ -146,26 +147,32 @@ void SetFakeImageInput(std::vector> *inputs, os << "}\n"; } LOG(INFO) << os.str(); - - int dim1 = feed_target_shapes[0][1]; - int dim2 = feed_target_shapes[0][2]; - int dim3 = feed_target_shapes[0][3]; - - PaddleTensor input; - std::vector shape({FLAGS_batch_size, dim1, dim2, dim3}); - input.shape = shape; - input.dtype = PaddleDType::FLOAT32; - - // fill input data, for profile easily, do not use random data here. - size_t size = FLAGS_batch_size * dim1 * dim2 * dim3; - input.data.Resize(size * sizeof(float)); - float *input_data = static_cast(input.data.data()); - for (size_t i = 0; i < size; i++) { - *(input_data + i) = static_cast(i) / size; + if (feed_names) { + PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size()); + } + std::vector input_slots(feed_target_shapes.size()); + for (size_t i = 0; i < feed_target_shapes.size(); ++i) { + const auto &feed_shape = feed_target_shapes[i]; + auto &input = input_slots[i]; + std::vector shape({FLAGS_batch_size}); + for (size_t s = 1; s < feed_shape.size(); ++s) { + shape.push_back(static_cast(feed_shape[s])); + } + if (feed_names) { + input.name = (*feed_names)[i]; + } + input.shape = shape; + input.dtype = PaddleDType::FLOAT32; + size_t len = std::accumulate(shape.begin(), shape.end(), 1, + [](int a, int b) { return a * b; }); + input.data.Resize(len * sizeof(float)); + input.lod.assign({{0, static_cast(FLAGS_batch_size)}}); + float *input_data = static_cast(input.data.data()); + // fill input data, for profile easily, do not use random data here. 
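+    // (deterministic j / len values keep profiling runs comparable across +    // executions; random data would add noise to timing comparisons)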
+ for (size_t j = 0; j < len; ++j) { + *(input_data + j) = static_cast(j) / len; + } } - - std::vector input_slots; - input_slots.assign({input}); (*inputs).emplace_back(input_slots); } From d46a140dd94406c669acedb78353131bfe89a115 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 25 Dec 2018 11:58:09 +0000 Subject: [PATCH 47/77] add seq pool inference test test=develop --- .../fluid/inference/tests/api/CMakeLists.txt | 4 + .../tests/api/analyzer_seq_pool1_tester.cc | 117 ++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 95bbc74a59..9aa9db031c 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -108,6 +108,10 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") +# seq_pool1 +inference_analysis_api_test_with_fake_data(test_analyzer_seq_pool1 +"${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1" analyzer_seq_pool1_tester.cc "seq_pool1.tar.gz") + # mobilenet with depthwise_conv op inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc new file mode 100644 index 0000000000..2ae840fd11 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void SetConfig(AnalysisConfig *cfg) { + cfg->param_file = FLAGS_infer_model + "/params"; + cfg->prog_file = FLAGS_infer_model + "/model"; + cfg->use_gpu = false; + cfg->device = 0; + cfg->enable_ir_optim = true; + cfg->specify_input_name = true; + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +void SetInput(std::vector> *inputs) { + std::vector feed_names = { + "slot10000_embed", "slot10001_embed", "slot10004_embed", + "slot10005_embed", "slot10008_embed", "slot10009_embed", + "slot10012_embed", "slot10013_embed", "slot10108_embed", + "slot13324_embed", "slot13325_embed", "slot13326_embed", + "slot13327_embed", "slot13328_embed", "slot13329_embed", + "slot13330_embed", "slot13331_embed", "slot15501_embed", + "slot15502_embed", "slot15503_embed", "slot15504_embed", + "slot15505_embed", "slot15506_embed", "slot15507_embed", + "slot15508_embed", "slot15516_embed", "slot15519_embed", + "slot15523_embed", "slot15531_embed", "slot15533_embed", + "slot15548_embed", "slot15564_embed", "slot15565_embed", + "slot15566_embed", "slot15570_embed", "slot15571_embed", + "slot15572_embed", "slot15573_embed", "slot15574_embed", + "slot15575_embed", "slot15576_embed", "slot15577_embed", + "slot15579_embed", "slot15581_embed", "slot15582_embed", + "slot15583_embed", "slot15584_embed", "slot5016_embed", + "slot5021_embed", "slot6002_embed", "slot6003_embed", + "slot6004_embed", "slot6005_embed", "slot6006_embed", + "slot6007_embed", "slot6008_embed", "slot6009_embed", + "slot6011_embed", "slot6014_embed", "slot6015_embed", + "slot6023_embed", "slot6024_embed", "slot6025_embed", + "slot6027_embed", "slot6029_embed", "slot6031_embed", + "slot6034_embed", "slot6035_embed", "slot6036_embed", + "slot6037_embed", "slot6039_embed", "slot6048_embed", + "slot6050_embed", "slot6058_embed", "slot6059_embed", + "slot6060_embed", "slot6066_embed", "slot6067_embed", + "slot6068_embed", "slot6069_embed", "slot6070_embed", + "slot6071_embed", "slot6072_embed", "slot6073_embed", + "slot6182_embed", "slot6183_embed", "slot6184_embed", + "slot6185_embed", "slot6186_embed", "slot6188_embed", + "slot6189_embed", "slot6190_embed", "slot6201_embed", + "slot6202_embed", "slot6203_embed", "slot6247_embed", + "slot6248_embed", "slot6250_embed", "slot6251_embed", + "slot6807_embed", "slot6808_embed", "slot6809_embed", + "slot6810_embed", "slot6811_embed", "slot6812_embed", + "slot6813_embed", "slot6814_embed", "slot6815_embed", + "slot6816_embed", "slot6817_embed", "slot6818_embed", + "slot6819_embed", "slot6820_embed", "slot6822_embed", + "slot6823_embed", "slot6826_embed", "slot7002_embed", + "slot7003_embed", "slot7004_embed", "slot7005_embed", + "slot7006_embed", "slot7008_embed", "slot7009_embed", + "slot7010_embed", "slot7011_embed", "slot7013_embed", + "slot7014_embed", "slot7015_embed", "slot7016_embed", + "slot7017_embed", "slot7019_embed", "slot7100_embed", + "slot7506_embed", "slot7507_embed", "slot7514_embed", + "slot7515_embed", "slot7516_embed"}; + SetFakeImageInput(inputs, FLAGS_infer_model, true, "model", "params", + &feed_names); +} + +// Easy for profiling independently. 
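+// (e.g. run just this case via --gtest_filter=Analyzer_seq_pool1.profile)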
+void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_seq_pool1, profile) { profile(); } + +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; + EXPECT_EQ(num_ops, 314); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle From b9fb03cf54d4594269d7c78be521600f75cb6f49 Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 25 Dec 2018 07:17:23 -0600 Subject: [PATCH 48/77] Move GetTensor to tensor_util (#15011) * refine tensor test=develop * refine tensor test=develop * fix device_context log test=develop --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/tensor.cc | 3 +- paddle/fluid/framework/tensor.h | 2 +- paddle/fluid/framework/tensor_util.h | 22 ++++++++++ paddle/fluid/operators/conv_op.h | 12 ++---- .../fluid/operators/math/concat_and_split.cu | 5 ++- .../create_tensor_with_allocationptr.h | 42 ------------------- paddle/fluid/platform/device_context.cc | 7 ++-- paddle/fluid/platform/device_context.h | 23 +++++++++- paddle/fluid/platform/temporary_allocator.h | 13 ++++++ .../platform/temporary_allocator_test.cc | 18 ++++---- 11 files changed, 80 insertions(+), 71 deletions(-) delete mode 100644 paddle/fluid/platform/create_tensor_with_allocationptr.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 412bc9cbe8..867970717b 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -48,10 +48,10 @@ if(WITH_GPU) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) add_dependencies(tensor tensor_util) else() - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context ) endif(WIN32) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context ) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 5b09cad06c..ef096c2b81 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -28,8 +28,7 @@ void Tensor::check_memory_size() const { "or maybe the required data-type mismatches the data already stored."); } -Tensor::Tensor(std::type_index type) - : type_(framework::ToDataType(type)), offset_(0) {} +Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {} size_t Tensor::memory_size() const { return holder_ == nullptr ? 0UL : holder_->size() - offset_; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 2e110133a3..40606d9b06 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -69,7 +69,7 @@ class Tensor { public: Tensor() : type_(proto::VarType::FP32), offset_(0) {} - explicit Tensor(std::type_index type); + explicit Tensor(const proto::VarType::Type&); /*! 
Return a pointer to mutable memory block. */ template diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index cab6d9b67e..871c7bd2a7 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/temporary_allocator.h" namespace paddle { namespace framework { @@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector* dst) { src_ptr, size); } +template +paddle::framework::Tensor GetTensor( + memory::allocation::AllocationPtr temp_allocation_ptr, + const framework::DDim& dim) { + auto& deleter = temp_allocation_ptr.get_deleter(); + auto* allocation_ptr = temp_allocation_ptr.release(); + auto shared_allocation = + std::shared_ptr(allocation_ptr, deleter); + + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); + PADDLE_ENFORCE_EQ(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor( + framework::ToDataType(std::type_index(typeid(T)))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 4a7b31c7d4..2519f5e7ac 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -18,11 +18,11 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" namespace paddle { namespace operators { @@ -161,10 +161,7 @@ class GemmConvKernel : public framework::OpKernel { auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( framework::product(col_shape) * sizeof(T)); - Tensor tep_tensor = - platform::GetTensor(std::move(tmp_allocation_ptr), col_shape); - - col.ShareDataWith(tep_tensor); + col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -299,10 +296,7 @@ class GemmConvGradKernel : public framework::OpKernel { auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( framework::product(col_shape) * sizeof(T)); - Tensor tep_tensor = - platform::GetTensor(std::move(tmp_allocation_ptr), col_shape); - - col.ShareDataWith(tep_tensor); + col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index b10a19b658..e925e7bb59 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -131,8 +131,9 @@ class ConcatFunctor { int in_col = input[0].numel() / in_row; int out_row = in_row, out_col = 0; - std::vector inputs_data(in_num); + std::vector inputs_data; std::vector inputs_col(in_num + 
1); + inputs_data.reserve(in_num); inputs_col[0] = 0; bool sameShape = true; @@ -143,7 +144,7 @@ class ConcatFunctor { } out_col += t_cols; inputs_col[i + 1] = out_col; - inputs_data[i] = const_cast(input[i].data()); + inputs_data.emplace_back(input[i].data()); } // computation diff --git a/paddle/fluid/platform/create_tensor_with_allocationptr.h b/paddle/fluid/platform/create_tensor_with_allocationptr.h deleted file mode 100644 index 00fcc5f862..0000000000 --- a/paddle/fluid/platform/create_tensor_with_allocationptr.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/temporary_allocator.h" -namespace paddle { -namespace platform { - -template -paddle::framework::Tensor GetTensor( - memory::allocation::AllocationPtr temp_allocation_ptr, - const framework::DDim &dim) { - auto &deleter = temp_allocation_ptr.get_deleter(); - auto *allocation_ptr = temp_allocation_ptr.release(); - auto shared_allocation = - std::shared_ptr(allocation_ptr, deleter); - - PADDLE_ENFORCE(dynamic_cast(allocation_ptr) != nullptr, - "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_EQ(allocation_ptr->size(), - framework::product(dim) * sizeof(T)); - - paddle::framework::Tensor temp_tensor(std::type_index(typeid(T))); - temp_tensor.Resize(dim); - temp_tensor.ResetHolder(std::move(shared_allocation)); - return temp_tensor; -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 81c443d758..022afb686b 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -256,10 +256,11 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device << ", CUDA Capability: " << compute_capability_ - << ", Driver Version: " << driver_version_ / 1000 + << ", Driver API Version: " << driver_version_ / 1000 << "." << (driver_version_ % 100) / 10 - << ", Runtime Version: " << runtime_version_ / 1000 - << "." << (runtime_version_ % 100) / 10; + << ", Runtime API Version: " + << runtime_version_ / 1000 << "." + << (runtime_version_ % 100) / 10; size_t cudnn_dso_ver = dynload::cudnnGetVersion(); LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index af9744dcb8..7e87580189 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -41,7 +41,28 @@ limitations under the License. */ namespace paddle { namespace platform { -/*! \brief device temporary allocator singleton */ +/*! \brief device temporary allocator singleton. 
+ *
+ * Some operators need temporary memory during computation, for example,
+ * conv_gemm, which needs to use col to store the result of im2col. If we
+ * create stack memory that is used by a CUDA kernel, we should add
+ * ctx->Wait() before Computation(...) returns, because CUDA execution is
+ * asynchronous; without ctx->Wait(), the temporary memory may be released
+ * before the CUDA kernel uses it.
+ *
+ * DeviceTemporaryAllocator is a singleton, which contains a
+ * `TemporaryAllocator` for each device. And the TemporaryAllocator
+ * contains a temp_allocation_queue which is used to store the temporary
+ * allocations. The allocation, which is allocated by TemporaryAllocator,
+ * is a unique_ptr, and when it is not held by any variable, it will be
+ * pushed into the temp_allocation_queue. There are two opportunities to free
+ * the allocations of temp_allocation_queue:
+ * - when the stream calls cudaStreamSynchronize;
+ * - when the total allocation size of temp_allocation_queue exceeds a
+ * certain threshold (defined by FLAGS_limit_of_temporary_allocation).
+ *
+ * */
 class DeviceTemporaryAllocator {
  public:
  static DeviceTemporaryAllocator& Instance() {
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h
index 4e32d2d695..812c4a3331 100644
--- a/paddle/fluid/platform/temporary_allocator.h
+++ b/paddle/fluid/platform/temporary_allocator.h
@@ -29,6 +29,19 @@ class TemporaryAllocation : public memory::allocation::Allocation {
 memory::allocation::AllocationPtr underlying_allocation_;
 };
+/*! \brief the TemporaryAllocator is used to allocate temporary memory
+ * that is used by CUDA's asynchronous operations.
+ *
+ * The TemporaryAllocator contains a temp_allocation_queue which
+ * is used to store the temporary allocations. The allocation, which is
+ * allocated by TemporaryAllocator, is a unique_ptr, and when it is not held
+ * by any variable, it will be pushed into the temp_allocation_queue.
+ *
+ * There is one opportunity to free the allocations of temp_allocation_queue:
+ * - when the total allocation size of temp_allocation_queue exceeds a
+ * certain threshold (defined by FLAGS_limit_of_temporary_allocation).
+ *
+ * */
 class TemporaryAllocator : public memory::allocation::Allocator {
  public:
  explicit TemporaryAllocator(platform::Place place);
diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc
index 3b940b0e82..e4e5be5b89 100644
--- a/paddle/fluid/platform/temporary_allocator_test.cc
+++ b/paddle/fluid/platform/temporary_allocator_test.cc
@@ -14,8 +14,7 @@
 #include "paddle/fluid/platform/temporary_allocator.h"
 #include <gtest/gtest.h>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
+#include "paddle/fluid/framework/tensor_util.h"
 DECLARE_double(limit_of_temporary_allocation);
 namespace paddle {
@@ -47,6 +46,7 @@ TEST(temporary_allocator, temporary_allocator) {
 TEST(temporary_allocator, add_callback) {
 #ifdef PADDLE_WITH_CUDA
+  const double limit = FLAGS_limit_of_temporary_allocation;
   FLAGS_limit_of_temporary_allocation = 10;
   platform::CUDAPlace gpu_place(0);
   TemporaryAllocator gpu_alloc(gpu_place);
@@ -63,7 +63,7 @@ TEST(temporary_allocator, add_callback) {
   });
   { gpu_alloc.Allocate(100); }
   PADDLE_ENFORCE(deleted);
-  FLAGS_limit_of_temporary_allocation = -1;
+  FLAGS_limit_of_temporary_allocation = limit;
 #endif
 }
@@ -75,8 +75,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
   auto allocation = cpu_alloc.Allocate(memory_size);
   void* address = allocation->ptr();
   int numel = memory_size / sizeof(float);
-  framework::Tensor tensor =
-      GetTensor<float>(std::move(allocation), framework::make_ddim({numel}));
+  framework::Tensor tensor = framework::GetTensor<float>(
+      std::move(allocation), framework::make_ddim({numel}));
   PADDLE_ENFORCE_EQ(address, tensor.data<float>());
   PADDLE_ENFORCE_EQ(tensor.numel(), numel);
 }
@@ -90,8 +90,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
   auto allocation = gpu_alloc.Allocate(memory_size);
   void* address = allocation->ptr();
   int numel = memory_size / sizeof(float);
-  framework::Tensor tensor =
-      GetTensor<float>(std::move(allocation), framework::make_ddim({numel}));
+  framework::Tensor tensor = framework::GetTensor<float>(
+      std::move(allocation), framework::make_ddim({numel}));
   PADDLE_ENFORCE_EQ(address, tensor.data<float>());
   PADDLE_ENFORCE_EQ(tensor.numel(), numel);
 }
@@ -116,7 +116,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
   {
     auto allocation = cpu_alloc.Allocate(memory_size);
     address = allocation->ptr();
-    framework::Tensor tensor = GetTensor<float>(
+    framework::Tensor tensor = framework::GetTensor<float>(
         std::move(allocation), framework::make_ddim({numel}));
     PADDLE_ENFORCE_EQ(address, tensor.data<float>());
     PADDLE_ENFORCE_EQ(tensor.numel(), numel);
@@ -138,7 +138,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
   {
     auto allocation = gpu_alloc.Allocate(memory_size);
     address = allocation->ptr();
-    framework::Tensor tensor = GetTensor<float>(
+    framework::Tensor tensor = framework::GetTensor<float>(
        std::move(allocation), framework::make_ddim({numel}));
     PADDLE_ENFORCE_EQ(address, tensor.data<float>());
     PADDLE_ENFORCE_EQ(tensor.numel(), numel);
From e821b12f57487f2ecab8debb13531adc05dd9453 Mon Sep 17 00:00:00 2001
From: Brian Liu
Date: Tue, 25 Dec 2018 14:48:02 +0800
Subject: [PATCH 49/77] Fix issue which causes abnormal CPU usage in stack op

Stack OP has much higher CPU cost than expected in release mode, caused by
DebugStringEx() in the base class OperatorWithKernel. Actually, this issue
occurs for every OP that hasn't implemented its own GetExpectedKernelType().
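For context, here is a minimal, standalone sketch (with hypothetical names
Enforce/ExpensiveDebugString, not part of this patch) of how such a slowdown
can arise: when an enforcement macro forwards its arguments to an ordinary
function, C++ evaluates every argument eagerly, so an expensive debug-string
argument is built on every call, even when the check passes and no error is
ever thrown.

    // Hypothetical sketch: the message argument is evaluated eagerly,
    // so the debug string is built although the check succeeds.
    #include <iostream>
    #include <stdexcept>
    #include <string>

    std::string ExpensiveDebugString() {
      std::cout << "building debug string\n";  // runs on every call
      return "op debug info";
    }

    void Enforce(bool cond, const std::string& msg) {
      if (!cond) throw std::runtime_error(msg);  // msg already constructed
    }

    int main() {
      bool initialized = true;
      Enforce(initialized, "Input is not initialized: " + ExpensiveDebugString());
      return 0;
    }

Dropping the DebugString() argument from the hot check, as the diff below
does, removes this per-call cost.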
test=develop --- paddle/fluid/framework/operator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index fec311e3ee..f48e403cef 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1061,8 +1061,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", - ipt_name, DebugString()); + PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized", + ipt_name); int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, From a28df3eb0b874168abb00809ffe5715c426b010e Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 26 Dec 2018 07:52:12 +0800 Subject: [PATCH 50/77] Fix the unstack layer (#15047) test=develop --- python/paddle/fluid/layers/nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bdfcc8c4e2..489433cff5 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7943,7 +7943,7 @@ def unstack(x, axis=0, num=None): num = x.shape[axis] outs = [] - for _ in num: + for _ in range(num): outs.append(helper.create_variable_for_type_inference(x.dtype)) helper.append_op( From 856f0da0fe040b3e8f25e9a1939fa6b4c7c7293d Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Wed, 26 Dec 2018 08:59:01 +0800 Subject: [PATCH 51/77] Fp16 training (#14992) * wip * wip * wip * wip for test * add fp16 tests test=develop * fix cpu build test=develop * fix test=develop * fix py3 tests test=develop * fix lr_scheduler dtype test=develop * fix test=dvelop * test fix ci compile test=develop * fix build and merge test=develop * fallback momentumop change to general test=develop * make fp16 lr schedule simple test=develop * fix ut test=develop * fix tests test=develop * remove fp16 learning rate cast test=develop --- .../details/multi_devices_graph_pass.cc | 8 ++- .../details/multi_devices_graph_pass.h | 3 +- .../details/scale_loss_grad_op_handle.cc | 61 +++++++++++++------ .../details/scale_loss_grad_op_handle.h | 5 +- .../elementwise/elementwise_div_op.cu | 5 ++ .../elementwise/elementwise_mul_op.cu | 22 ++++--- .../fluid/operators/fill_zeros_like_op.cu.cc | 3 + paddle/fluid/operators/metrics/accuracy_op.cu | 8 ++- .../fluid/operators/optimizers/momentum_op.cu | 5 +- .../fluid/operators/optimizers/momentum_op.h | 6 +- paddle/fluid/operators/top_k_op.cu | 15 +++-- paddle/fluid/platform/nccl_helper.h | 3 + python/paddle/fluid/data_feeder.py | 2 + python/paddle/fluid/initializer.py | 54 ++++++++++++++-- python/paddle/fluid/layers/nn.py | 8 ++- .../paddle/fluid/tests/unittests/op_test.py | 2 + .../fluid/tests/unittests/test_accuracy_op.py | 17 +++++- .../unittests/test_elementwise_div_op.py | 25 +++++++- .../unittests/test_elementwise_mul_op.py | 5 ++ .../unittests/test_fill_zeros_like_op.py | 12 +++- .../fluid/tests/unittests/test_momentum_op.py | 21 +++++-- .../fluid/tests/unittests/test_top_k_op.py | 13 +++- 22 files changed, 242 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 036cef1daa..7e320a0894 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -355,7 +355,9 @@ std::unique_ptr 
MultiDevSSAGraphBuilder::ApplyImpl( BuildStrategy::GradientScaleStrategy::kCustomized) { // TODO(paddle-dev): Why is there no input for this op_handle? auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; - CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]); + auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType(); + CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0], + out_dtype); } // This assumes the backward generating code will ensure IsScaleLossOp // is true only for the op that scale the final scalar loss. @@ -658,13 +660,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID( void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node) const { + ir::Node *out_var_node, proto::VarType::Type dtype) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx); + local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 0556232aa4..5736102ddc 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void CreateScaleLossGradOp(ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node) const; + ir::Node *out_var_node, + proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index ef16265997..e1b8e8fe05 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -22,39 +22,66 @@ namespace details { ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, platform::Place place, - platform::DeviceContext *dev_ctx) + platform::DeviceContext *dev_ctx, + proto::VarType::Type dtype) : OpHandleBase(node), coeff_(static_cast(1.0 / num_dev)), scope_(scope), - place_(place) { + place_(place), + out_dtype_(dtype) { this->SetDeviceContext(place_, dev_ctx); } ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} +struct ScaleLossGradFunctor { + float coeff_; + Tensor *out_; + platform::Place place_; + OpHandleBase *op_handle_; + proto::VarType::Type out_dtype_; + platform::DeviceContext *ctx_; + + ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place, + OpHandleBase *op_handle, proto::VarType::Type dtype, + platform::DeviceContext *ctx) + : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {} + + template + void apply() const { + auto *out_data = out_->mutable_data(place_); + if (platform::is_cpu_place(place_)) { + *out_data = static_cast(coeff_); + } else { +#ifdef PADDLE_WITH_CUDA + OutT cast_coeff = static_cast(coeff_); + auto stream = static_cast(ctx_)->stream(); + memory::Copy(boost::get(place_), out_data, + platform::CPUPlace(), &cast_coeff, 
SizeOfType(out_dtype_), + stream); + VLOG(10) << place_ << "RUN Scale loss grad op"; + +#endif + } + } +}; + void ScaleLossGradOpHandle::RunImpl() { // Doesn't wait any event std::string var_name = static_cast(this->outputs_[0])->name_; auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get(); - float *tmp = local_scope.FindVar(var_name) - ->GetMutable() - ->mutable_data(make_ddim({1}), place_); + auto *tensor = local_scope.FindVar(var_name)->GetMutable(); + tensor->Resize(make_ddim({1})); - if (platform::is_cpu_place(place_)) { - *tmp = coeff_; - } else { #ifdef PADDLE_WITH_CUDA - this->RunAndRecordEvent([&] { - auto stream = static_cast( - this->dev_ctxes_.at(place_)) - ->stream(); - memory::Copy(boost::get(place_), tmp, - platform::CPUPlace(), &coeff_, sizeof(float), stream); - VLOG(10) << place_ << "RUN Scale loss grad op"; - }); + ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, + this->dev_ctxes_.at(place_)); + this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); }); +#else + ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr); + framework::VisitDataType(out_dtype_, func); #endif - } } std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index 523b55724c..8bedd1643e 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -26,8 +26,8 @@ namespace details { struct ScaleLossGradOpHandle : public OpHandleBase { ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, - platform::Place place, - platform::DeviceContext *context); + platform::Place place, platform::DeviceContext *context, + proto::VarType::Type dtype); ~ScaleLossGradOpHandle() final; @@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase { float coeff_; Scope *scope_; platform::Place place_; + proto::VarType::Type out_dtype_; }; } // namespace details diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 1a149298fd..ae669f5525 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_div, ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + elementwise_mul, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index 9538177460..e80a703c30 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fill_zeros_like_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( @@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL( ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, ops::FillZerosLikeKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu index b255d2a7c4..4682940f7e 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ b/paddle/fluid/operators/metrics/accuracy_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/metrics/accuracy_op.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/gpu_info.h" namespace paddle { @@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { // FIXME(typhoonzero): types of T is for inference data. // label data is always int64 -REGISTER_OP_CUDA_KERNEL(accuracy, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + accuracy, paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op.cu b/paddle/fluid/operators/optimizers/momentum_op.cu index 8ce739de8d..7f9e724640 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cu +++ b/paddle/fluid/operators/optimizers/momentum_op.cu @@ -14,8 +14,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/optimizers/momentum_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( momentum, ops::MomentumOpKernel, - ops::MomentumOpKernel); + ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 71f079e4d9..f6ef83c3ba 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -237,7 +237,8 @@ class SparseMomentumFunctor { inline HOSTDEVICE void operator()(size_t i) { auto row_idx = math::BinarySearch(rows_, row_height_, i / row_numel_); - T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; + T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] + : static_cast(0); // put memory access in register const T p = p_[i]; const T lr = lr_[0]; @@ -282,7 +283,8 @@ class SparseMomentumFunctor { inline HOSTDEVICE void operator()(size_t i) { auto row_idx = math::BinarySearch(rows_, row_height_, i / row_numel_); - T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; + T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] + : static_cast(0); // put memory access in register const T p = p_[i]; const T lr = lr_[0]; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 0cad224ca8..99a4b1b7b0 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, if (k < MaxLength - (*beam)) { topk[k] = topk[k + *beam]; } else { - topk[k].set(-INFINITY, -1); + topk[k].set(-static_cast(INFINITY), -1); } } if (!(*is_empty)) { @@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, } *max = topk[MaxLength - 1]; - if ((*max).v == -1) *is_empty = true; + if ((*max).v == -static_cast(1)) *is_empty = true; *beam = 0; } } @@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, if (k < MaxLength - *beam) { topk[k] = topk[k + *beam]; } else { - topk[k].set(-INFINITY, -1); + topk[k].set(-static_cast(INFINITY), -1); } } if (!(*is_empty)) { @@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, bool firststep = true; for (int j = 0; j < MaxLength; j++) { - topk[j].set(-INFINITY, -1); + topk[j].set(-static_cast(INFINITY), -1); } while (top_num) { ThreadGetTopK( @@ -362,5 +363,7 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + top_k, paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index cbb090adef..6ce4bf8f13 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/dynload/nccl.h" 
#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" #define NCCL_ID_VARNAME "NCCLID" @@ -38,6 +39,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclInt; } else if (type == framework::proto::VarType::INT64) { return ncclInt64; + } else if (type == framework::proto::VarType::FP16) { + return ncclFloat16; } else { PADDLE_THROW("Not supported"); } diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 13d2893fd1..af02721eb7 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -44,6 +44,8 @@ class DataToLoDTensorConverter(object): self.dtype = 'int64' elif dtype == core.VarDesc.VarType.FP64: self.dtype = 'float64' + elif dtype == core.VarDesc.VarType.FP16: + self.dtype = 'float16' elif dtype == core.VarDesc.VarType.INT32: self.dtype = 'int32' elif dtype == core.VarDesc.VarType.UINT8: diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index b37ebbe517..26d1f8f4d2 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -18,6 +18,7 @@ from . import framework import numpy as np import contextlib from .core import VarDesc +from . import unique_name __all__ = [ 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', @@ -207,16 +208,39 @@ class UniformInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + op = block._prepend_op( type="uniform_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": out_dtype, "min": self._low, "max": self._high, "seed": self._seed }) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + var.op = op return op @@ -261,17 +285,39 @@ class NormalInitializer(Initializer): # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16: + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate(".".join(['gaussian_random', 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + op = block._prepend_op( type="gaussian_random", - outputs={"Out": var}, + outputs={"Out": out_var}, attrs={ "shape": var.shape, - "dtype": int(var.dtype), + "dtype": out_dtype, "mean": self._mean, "std": self._std_dev, "seed": self._seed, "use_mkldnn": False }) + + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) var.op = op return op diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 489433cff5..8ac7efee50 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ 
-2801,6 +2801,10 @@ def batch_norm(input, helper = LayerHelper('batch_norm', **locals()) dtype = helper.input_dtype() + # use fp32 for bn parameter + if dtype == core.VarDesc.VarType.FP16: + dtype = core.VarDesc.VarType.FP32 + input_shape = input.shape if data_layout == 'NCHW': channel_num = input_shape[1] @@ -2835,7 +2839,7 @@ def batch_norm(input, trainable=False, do_model_average=do_model_average_for_mean_and_var), shape=param_shape, - dtype=input.dtype) + dtype=dtype) mean.stop_gradient = True variance = helper.create_parameter( @@ -2845,7 +2849,7 @@ def batch_norm(input, trainable=False, do_model_average=do_model_average_for_mean_and_var), shape=param_shape, - dtype=input.dtype) + dtype=dtype) variance.stop_gradient = True # create output diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 76a707efdc..0fe836683b 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -368,6 +368,8 @@ class OpTest(unittest.TestCase): place = core.CUDAPlace(0) if core.is_float16_supported(place): return [place] + else: + return [] else: return [] places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py index 1b2b53f2d4..5257b0be6f 100644 --- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py +++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py @@ -22,8 +22,10 @@ from op_test import OpTest class TestAccuracyOp(OpTest): def setUp(self): self.op_type = "accuracy" + self.dtype = np.float32 + self.init_dtype() n = 8192 - infer = np.random.random((n, 1)).astype("float32") + infer = np.random.random((n, 1)).astype(self.dtype) indices = np.random.randint(0, 2, (n, 1)) label = np.random.randint(0, 2, (n, 1)) self.inputs = {'Out': infer, 'Indices': indices, "Label": label} @@ -34,14 +36,25 @@ class TestAccuracyOp(OpTest): num_correct += 1 break self.outputs = { - 'Accuracy': np.array([num_correct / float(n)]).astype("float32"), + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), 'Correct': np.array([num_correct]).astype("int32"), 'Total': np.array([n]).astype("int32") } + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestAccuracyOpFp16(TestAccuracyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index cadaf1df53..15d4db590e 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -21,14 +21,16 @@ from op_test import OpTest class ElementwiseDivOp(OpTest): def setUp(self): self.op_type = "elementwise_div" + self.dtype = np.float32 + self.init_dtype() """ Warning CPU gradient check error! 
'X': np.random.random((32,84)).astype("float32"), 'Y': np.random.random((32,84)).astype("float32") """ self.inputs = { - 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), - 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + 'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype), + 'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) } self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} @@ -46,6 +48,9 @@ class ElementwiseDivOp(OpTest): self.check_grad( ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y')) + def init_dtype(self): + pass + class TestElementwiseDivOp_scalar(ElementwiseDivOp): def setUp(self): @@ -126,5 +131,21 @@ class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp): } +class TestElementwiseDivOpFp16(ElementwiseDivOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=1, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=1, no_grad_set=set('Y')) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 57ba34f833..0484099188 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -135,5 +135,10 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): } +class TestElementwiseMulOpFp16(ElementwiseMulOp): + def init_dtype(self): + self.dtype = np.float16 + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py index eec73d0beb..20f1a110c3 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py @@ -22,12 +22,22 @@ from op_test import OpTest class TestFillZerosLikeOp(OpTest): def setUp(self): self.op_type = "fill_zeros_like" - self.inputs = {'X': np.random.random((219, 232)).astype("float32")} + self.dtype = np.float32 + self.init_dtype() + self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)} self.outputs = {'Out': np.zeros_like(self.inputs["X"])} + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestFillZerosLikeOpFp16(TestFillZerosLikeOp): + def init_dtype(self): + self.dtype = np.float16 + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index cf4346cf2e..77ec6f9b6b 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -24,11 +24,13 @@ from op_test import OpTest class TestMomentumOp1(OpTest): def setUp(self): self.op_type = "momentum" + self.dtype = np.float32 + self.init_dtype() - param = np.random.random((123, 321)).astype("float32") - grad = np.random.random((123, 321)).astype("float32") - velocity = np.zeros((123, 321)).astype("float32") - learning_rate = np.array([0.001]).astype("float32") + param = np.random.random((123, 321)).astype(self.dtype) + grad = np.random.random((123, 321)).astype(self.dtype) + velocity = np.zeros((123, 321)).astype(self.dtype) + 
learning_rate = np.array([0.001]).astype(self.dtype) mu = 0.0001 use_nesterov = False @@ -50,10 +52,21 @@ class TestMomentumOp1(OpTest): self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} + def init_dtype(self): + pass + def test_check_output(self): self.check_output() +class TestMomentumOpFp16(TestMomentumOp1): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3) + + class TestMomentumOp2(OpTest): '''Test Momentum with default values for attributes ''' diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py index 69b29db83a..21b5a62baf 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py @@ -23,8 +23,11 @@ class TestTopkOp(OpTest): def setUp(self): self.set_args() self.op_type = "top_k" + self.dtype = np.float32 + self.init_dtype() + k = self.top_k - input = np.random.random((self.row, k)).astype("float32") + input = np.random.random((self.row, k)).astype(self.dtype) output = np.ndarray((self.row, k)) indices = np.ndarray((self.row, k)).astype("int64") @@ -38,6 +41,9 @@ class TestTopkOp(OpTest): self.outputs = {'Out': output, 'Indices': indices} + def init_dtype(self): + pass + def set_args(self): self.row = 32 self.top_k = 1 @@ -46,6 +52,11 @@ class TestTopkOp(OpTest): self.check_output() +class TestTopkOpFp16(TestTopkOp): + def init_dtype(self): + self.dtype = np.float16 + + class TestTopkOp3d(OpTest): def setUp(self): self.op_type = "top_k" From 3ea2f415dcf2829d0f8af9a24793024292416a15 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 26 Dec 2018 10:06:09 +0800 Subject: [PATCH 52/77] fix ci error. test=develop --- paddle/fluid/operators/distributed_ops/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 3c0b7ff24f..a8bb597cbd 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -33,7 +33,7 @@ register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS}) if(WITH_GPU AND NOT WIN32) set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common) - op_library(gen_nccl_id_op ${DISTRIBUTE_DEPS} nccl_common) + op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common) endif() set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE) From 179acc60b3859545bec0c77009ac3e63eb9dd4ca Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 26 Dec 2018 03:20:28 +0000 Subject: [PATCH 53/77] fix conflict with develop test=develop --- paddle/fluid/framework/var_type_traits.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 1b535219c1..cc68cf2ab8 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -155,13 +155,24 @@ template struct VarTypeTrait { static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); using Type = T; - // Default id generation + /** + * Unique VarType Id generation. + * + * The auto-generated id should not be the same as any protobuf id defined in + * framework.proto. Therefore, we generate id by adding the type pos and + * maximum protobuf id (i.e., proto::VarType::TUPLE). + * + * However, we may need more protobuf id in the future. 
+ * To avoid changing this auto id generation algorithm frequently, we + * generate id by adding the type pos and twice of maximum protobuf id (i.e., + * proto::VarType::TUPLE). + */ static constexpr int kId = VarTypeRegistry::TypePos() + static_cast(proto::VarType::TUPLE) * 2; }; // Users should set some of variable type ids to be what is defined in -// framework.proto here +// framework.proto below REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); From 2314f2ebb3489d891b895a22a1495d5ba2a08381 Mon Sep 17 00:00:00 2001 From: whs Date: Wed, 26 Dec 2018 12:00:23 +0800 Subject: [PATCH 54/77] Make topk op support variable k. (#15044) * Make topk op support variable k. test=develop * Fix tensor type. test=develop --- paddle/fluid/operators/top_k_op.cc | 15 ++++++++++++++- paddle/fluid/operators/top_k_op.cu | 11 +++++++++++ paddle/fluid/operators/top_k_op.h | 12 ++++++++++-- python/paddle/fluid/layers/nn.py | 12 +++++++++--- .../paddle/fluid/tests/unittests/test_top_k_op.py | 15 +++++++++++++-- 5 files changed, 57 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index c17d1afc30..9e77f7252d 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -21,7 +21,7 @@ class TopkOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of TopkOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -44,12 +44,25 @@ class TopkOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Indices"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context(), layout_, library_); + } }; class TopkOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor) The input of Topk op"); + AddInput("K", + "(Tensor) Number of top elements to look for along " + "the last dimension (along each row for matrices).") + .AsDispensable(); AddOutput("Out", "(Tensor) The output tensor of Topk op"); AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); AddComment(R"DOC( diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 99a4b1b7b0..c27039dd0a 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -327,6 +327,17 @@ class TopkOpCUDAKernel : public framework::OpKernel { auto* indices = ctx.Output("Indices"); size_t k = static_cast(ctx.Attr("k")); + auto* k_t = ctx.Input("K"); + if (k_t) { + Tensor k_host; + framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); + k = k_host.data()[0]; + framework::DDim output_dims = output->dims(); + output_dims[output_dims.size() - 1] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } + const T* input_data = input->data(); T* output_data = output->mutable_data(ctx.GetPlace()); // FIXME(typhoonzero): data is always converted to 
type T? diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 76ece57b39..f7bac67300 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -37,8 +37,16 @@ class TopkKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); - // k is determined by Attr - const size_t k = static_cast(ctx.Attr("k")); + + size_t k = static_cast(ctx.Attr("k")); + auto* k_t = ctx.Input("K"); + if (k_t) { + k = k_t->data()[0]; + framework::DDim output_dims = output->dims(); + output_dims[output_dims.size() - 1] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } T* output_data = output->mutable_data(ctx.GetPlace()); int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8ac7efee50..cc1fdbd285 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4530,7 +4530,7 @@ def topk(input, k, name=None): Args: input(Variable): The input variable which can be a vector or Tensor with higher rank. - k(int): The number of top elements to look for along the last dimension + k(int | Variable): The number of top elements to look for along the last dimension of input. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -4553,12 +4553,18 @@ def topk(input, k, name=None): helper = LayerHelper("top_k", **locals()) values = helper.create_variable_for_type_inference(dtype=input.dtype) indices = helper.create_variable_for_type_inference(dtype="int64") + inputs = {"X": [input]} + attrs = None + if isinstance(k, Variable): + inputs['K'] = k + else: + attrs = {'k': k} helper.append_op( type="top_k", - inputs={"X": [input]}, + inputs=inputs, outputs={"Out": [values], "Indices": [indices]}, - attrs={"k": k}) + attrs=attrs) values.stop_gradient = True indices.stop_gradient = True return values, indices diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py index 21b5a62baf..9fbf59ed66 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py @@ -21,6 +21,7 @@ from op_test import OpTest class TestTopkOp(OpTest): def setUp(self): + self.variable_k = False self.set_args() self.op_type = "top_k" self.dtype = np.float32 @@ -30,9 +31,12 @@ class TestTopkOp(OpTest): input = np.random.random((self.row, k)).astype(self.dtype) output = np.ndarray((self.row, k)) indices = np.ndarray((self.row, k)).astype("int64") - self.inputs = {'X': input} - self.attrs = {'k': k} + + if self.variable_k: + self.inputs['K'] = np.array([k]).astype("int32") + else: + self.attrs = {'k': k} for rowid in range(self.row): row = input[rowid] @@ -118,5 +122,12 @@ class TestTopkOp4(TestTopkOp): self.top_k = 1 +class TestTopkOp5(TestTopkOp): + def set_args(self): + self.row = 40000 + self.top_k = 3 + self.variable_k = True + + if __name__ == "__main__": unittest.main() From 956cf92145842f1e7ff760434074b42479fe704b Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 26 Dec 2018 05:54:51 +0000 Subject: [PATCH 55/77] Fix conv_elementwise_add2_act pass test=develop --- .../ir/conv_elementwise_add2_act_fuse_pass.cc | 25 +++++++++++-------- .../framework/ir/graph_pattern_detector.cc | 12 ++++----- paddle/fluid/operators/conv_fusion_op.cu.cc | 4 +-- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git 
a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 23f343f631..c6121777e8 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc( const std::string& output) { auto proto = base_desc; framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); desc.SetInput("Bias", {bias}); desc.SetInput("ResidualData", {bias1}); desc.SetAttr("activation", activation); desc.SetOutput("Output", {output}); desc.SetAttr("is_test", true); - + desc.SetAttr("use_cudnn", false); + desc.Flush(); return *desc.Proto(); } std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( std::unique_ptr graph) const { - const std::string pattern_name = "conv_elementwise_add_act_fuse"; + const std::string pattern_name = "conv_elementwise_add2_act_fuse"; FusePassBase::Init(pattern_name, graph.get()); GraphPatternDetector gpd; @@ -76,22 +78,23 @@ std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( framework::OpDesc new_op_desc(new_op_proto, nullptr); // Create a new node for the fused op. - graph->CreateOpNode(&new_op_desc); + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); // Link inputs and outputs. PADDLE_ENFORCE(subgraph.count(x)); auto* conv_in_node = subgraph.at(x); - IR_NODE_LINK_TO(conv_in_node, conv_op); // Input - IR_NODE_LINK_TO(conv_filter, conv_op); // Filter - IR_NODE_LINK_TO(conv_op, conv_out); // Output - IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias - IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output // Delete the unneeded nodes. 
-      GraphSafeRemoveNodes(graph.get(),
-                           {conv_op, elementwise_add_op, elementwise_add_op_1,
-                            elementwise_add_out});
+      GraphSafeRemoveNodes(
+          graph.get(),
+          {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
+           elementwise_add_out, elementwise_add_out_1, act_op});
   };
   gpd(graph.get(), handler);
   return graph;
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 13d752e516..73d1a3da8f 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
   return out_var;
 }
-std::unordered_set<std::string> conv_act_set({"identity", "sigmoid", "relu",
-                                              "relu6", "relux", "tanh",
-                                              "band_pass"});
+std::unordered_set<std::string> conv_act_set({"identity", "relu"});
 PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
   conv_in->AsInput();
@@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
                               ->AsInput();
   auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
                                  ->assert_is_op_output("elementwise_add")
-                                 ->assert_is_op_input("elementwise_add", "X")
+                                 ->assert_is_op_input("elementwise_add", "Y")
                                  ->AsIntermediate();
   auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr())
                                   ->assert_is_op("elementwise_add");
   auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr())
-                                    ->assert_is_op_input("elementwise_add", "Y")
+                                    ->assert_is_op_input("elementwise_add", "X")
                                     ->AsInput();
   auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr())
                                   ->assert_is_op_output("elementwise_add")
@@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
   conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out});
   elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
       .LinksTo({elementwise_add_out});
-  elementwise_add_op_1->LinksFrom(
-      {elementwise_add_out, elementwise_add_in_y_1});
+  elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1})
+      .LinksTo({elementwise_add_out_1});
   act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out});
   return act_out;
 }
diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index 3235ad52b9..acceadab16 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -22,7 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search);
 namespace paddle {
 namespace operators {
-#if CUDNN_VERSION >= 7001
+#if CUDNN_VERSION >= 7100
 using Tensor = framework::Tensor;
 using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
@@ -204,7 +204,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
-#if CUDNN_VERSION >= 7001
+#if CUDNN_VERSION >= 7100
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel<float>,
                         ops::CUDNNConvFusionOpKernel<double>);
From a6aa8ea7719f6664e5218bb13d3d1db691e4225f Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Wed, 26 Dec 2018 05:58:23 +0000
Subject: [PATCH 56/77] faster rcnn input is persistable.
(fix it in paddle-trt)

test=develop
---
 .../framework/ir/graph_pattern_detector.cc |  6 -----
 .../ir_passes/tensorrt_subgraph_pass.cc    | 22 +++++++++++++++++--
 2 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 6ef3417901..a826dfb275 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1101,12 +1101,6 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
   return out_var;
 }
-// only support "identity" and "relu" now.
-/*
-std::unordered_set<std::string> conv_act_set({"identity", "sigmoid", "relu",
-                                              "relu6", "relux", "tanh",
-                                              "band_pass"});
-*/
 std::unordered_set<std::string> conv_act_set({"identity", "relu"});
 PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 9c42b83e7a..5886868be0 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
+#include
 #include
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
+#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
 namespace paddle {
 namespace inference {
@@ -197,10 +199,26 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
 std::vector<std::string> ExtractParameters(
     const std::unordered_set<Node *> &nodes) {
+  // We can judge whether a variable is a parameter by
+  // its persistable property, but sometimes the persistable
+  // of the feed op output is true, so we have to identify it.
+ std::vector feed_outputs; + for (const auto &node : nodes) { + if (!node->IsOp()) continue; + std::string op_type = node->Op()->Type(); + if (op_type == "feed") { + std::vector output_names = node->Op()->OutputArgumentNames(); + std::copy(output_names.begin(), output_names.end(), + std::back_inserter(feed_outputs)); + } + } + std::vector parameters; for (const auto &node : nodes) { if (!node->IsVar()) continue; - if (node->Var()->Persistable()) { + if (node->Var()->Persistable() && + std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == + feed_outputs.end()) { parameters.push_back(node->Name()); } } From dc8eca826ecd4a9029fc65fb482ea47805c1a384 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 26 Dec 2018 15:08:57 +0800 Subject: [PATCH 57/77] code style fix, test=develop (#15045) * code style fix, test=develop --- paddle/fluid/framework/attribute.h | 27 ++++++++++--------- paddle/fluid/framework/op_desc.cc | 2 +- paddle/fluid/framework/op_registry.cc | 2 +- .../operators/sequence_ops/sequence_mask_op.h | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index d9c76881b7..67054eccb3 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -165,7 +165,7 @@ template class GreaterThanChecker { public: explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} - void operator()(T& value) const { + void operator()(const T& value) const { PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails."); } @@ -177,7 +177,7 @@ template class EqualGreaterThanChecker { public: explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} - void operator()(T& value) const { + void operator()(const T& value) const { PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails."); } @@ -193,7 +193,7 @@ class DefaultValueSetter { public: explicit DefaultValueSetter(T default_value) : default_value_(default_value) {} - void operator()(T& value) const { value = default_value_; } // NOLINT + void operator()(T* value) const { *value = default_value_; } private: T default_value_; @@ -203,7 +203,7 @@ template class EnumInContainer { public: explicit EnumInContainer(const std::unordered_set& c) : container_(c) {} - void operator()(T& val) const { + void operator()(const T& val) const { PADDLE_ENFORCE(container_.find(val) != container_.end(), "Value %s is not in enum container %s", val, ContainerDebugString()); @@ -232,7 +232,8 @@ class EnumInContainer { // an attribute can have more than one limits template class TypedAttrChecker { - typedef std::function ValueChecker; + typedef std::function DefaultValueChecker; + typedef std::function ValueChecker; public: explicit TypedAttrChecker(const std::string& attr_name) @@ -268,17 +269,17 @@ class TypedAttrChecker { return *this; } - void operator()(AttributeMap& attr_map) const { // NOLINT - if (!attr_map.count(attr_name_)) { + void operator()(AttributeMap* attr_map) const { + if (!attr_map->count(attr_name_)) { // user do not set this attr PADDLE_ENFORCE(!default_value_setter_.empty(), "Attribute '%s' is required!", attr_name_); // default_value_setter_ has no more than one element T val; - (default_value_setter_[0])(val); - attr_map[attr_name_] = val; + (default_value_setter_[0])(&val); + (*attr_map)[attr_name_] = val; } - Attribute& attr = attr_map.at(attr_name_); + Attribute& attr = attr_map->at(attr_name_); ExtractAttribute extract_attr(attr_name_); T* attr_value = 
extract_attr(attr); for (const auto& checker : value_checkers_) { @@ -289,12 +290,12 @@ class TypedAttrChecker { private: std::string attr_name_; std::vector value_checkers_; - std::vector default_value_setter_; + std::vector default_value_setter_; }; // check whether op's all attributes fit their own limits class OpAttrChecker { - typedef std::function AttrChecker; + typedef std::function AttrChecker; public: template @@ -304,7 +305,7 @@ class OpAttrChecker { return *(checker.target>()); } - void Check(AttributeMap& attr_map) const { // NOLINT + void Check(AttributeMap* attr_map) const { for (const auto& checker : attr_checkers_) { checker(attr_map); } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 2fe1c94ec0..0e7b0cbeb9 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -643,7 +643,7 @@ void OpDesc::CheckAttrs() { // not by users. return; } - checker->Check(attrs_); + checker->Check(&attrs_); } void OpDesc::InferShape(const BlockDesc &block) const { diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc index bfc411ca2c..346d14d408 100644 --- a/paddle/fluid/framework/op_registry.cc +++ b/paddle/fluid/framework/op_registry.cc @@ -24,7 +24,7 @@ std::unique_ptr OpRegistry::CreateOp( const VariableNameMap& outputs, AttributeMap attrs) { auto& info = OpInfoMap::Instance().Get(type); if (info.Checker() != nullptr) { - info.Checker()->Check(attrs); + info.Checker()->Check(&attrs); } auto op = info.Creator()(type, inputs, outputs, attrs); return std::unique_ptr(op); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index 8fceed3558..57d6f4b3ea 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -52,7 +52,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { "The maximum length of the sequence. 
If maxlen < 0, maxlen " "= max(Input(X)).") .SetDefault(-1) - .AddCustomChecker([](int &v) { + .AddCustomChecker([](const int &v) { PADDLE_ENFORCE(v < 0 || v >= 1, "Attr(maxlen) must be less than 0 or larger than 1"); }); From 01c00b07dd5739d6bc9f3a33eebe27d2d32e6d24 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 26 Dec 2018 16:05:19 +0800 Subject: [PATCH 58/77] fix test issues on windows test=develop --- cmake/simd.cmake | 73 ++++++++++++------------- paddle/fluid/framework/CMakeLists.txt | 32 ++++------- paddle/fluid/framework/mixed_vector.h | 10 ++-- paddle/fluid/framework/op_registry.h | 3 +- paddle/fluid/inference/tests/test.cmake | 8 ++- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/cum_op.h | 2 + paddle/fluid/operators/huber_loss_op.h | 8 ++- paddle/fluid/platform/float16_test.cc | 1 + paddle/fluid/platform/float16_test.cu | 1 + 10 files changed, 69 insertions(+), 71 deletions(-) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 86096d4fea..566dc75fda 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,46 +57,43 @@ int main() return 0; }" SSE3_FOUND) -# disable AVX by default on windows -if(NOT WIN32) - # Check AVX - set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) - set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; - }" AVX_FOUND) +# Check AVX +set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; +}" AVX_FOUND) - # Check AVX 2 - set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) - set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; - }" AVX2_FOUND) +# Check AVX 2 +set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; +}" AVX2_FOUND) - # Check AVX512F - set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) - set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; - }" AVX512F_FOUND) -endif(NOT WIN32) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; +}" AVX512F_FOUND) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 
867970717b..d7fbc4466f 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -7,27 +7,17 @@ function(windows_symbolic TARGET) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) - get_filename_component(src ${src} NAME_WE) - if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) - message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") - endif() - -#only copy the xx.cu to.xx.cu when the content are modified - set(copy_flag 1) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) - if (SOURCE_STR STREQUAL TARGET_STR) - set(copy_flag 0) - endif() - endif() - if (copy_flag) - add_custom_command(OUTPUT .${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") - endif(copy_flag) - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + endif() + + file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + + add_custom_command(OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 6940250c3f..c3a044d22c 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable memory::AllocationPtr gpu_; + mutable paddle::memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6d39bb3c52..2c1648c81f 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,7 +23,8 @@ limitations under the License. 
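The op_registry.h hunk that follows (and the float16 tests later in this patch) defines GLOG_NO_ABBREVIATED_SEVERITIES before pulling in glog. The motivation is glog's documented Windows caveat: windows.h defines an ERROR macro, which collides with the abbreviated severity constants glog otherwise declares. A minimal sketch of the fix, assuming glog's documented behavior; this program is illustrative and not part of the patch:

#define GLOG_NO_ABBREVIATED_SEVERITIES  // must come before glog/logging.h
#include <windows.h>
#include "glog/logging.h"

int main() {
  // Without the define, glog declares constants such as google::ERROR, and
  // the ERROR macro from windows.h rewrites that declaration into invalid
  // code. LOG(ERROR) itself compiles either way, because LOG(severity)
  // token-pastes its argument (COMPACT_GOOGLE_LOG_##severity), so the
  // windows.h macro never gets a chance to expand there.
  LOG(ERROR) << "glog coexisting with windows.h";
  return 0;
}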
*/ #include #include -#include "glog/logging.h" // For VLOG() +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index ab3a30ce6b..29f0f034a2 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -3,14 +3,16 @@ set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") function (inference_download install_dir url filename) message(STATUS "Download inference test stuff from ${url}/${filename}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") + file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}") message(STATUS "finish downloading ${filename}") endfunction() function (inference_download_and_uncompress install_dir url filename) inference_download(${install_dir} ${url} ${filename}) - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename} + WORKING_DIRECTORY ${install_dir} + ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4a14eb941c..ee15420775 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -46,7 +46,7 @@ endif() register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) # warpctc_op needs cudnn 7 above -if (WITH_GPU AND NOT WIN32) +if (WITH_GPU) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 999fdcff90..7c0fda4169 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once + +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h index 9efda3dfc9..666500ef26 100644 --- a/paddle/fluid/operators/huber_loss_op.h +++ b/paddle/fluid/operators/huber_loss_op.h @@ -104,15 +104,19 @@ class HuberLossGradKernel : public framework::OpKernel { if (out0) { out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); + // MSVC not treat it well when partial template arguments were specified x_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); + out_grad * + residual.unaryExpr(HuberLossBackward(delta, static_cast(-1.0))); } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenVector::Flatten(*out1); + // MSVC not treat it well when partial template arguments were specified y_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); + out_grad * + residual.unaryExpr(HuberLossBackward(delta, static_cast(1.0))); } } }; diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index 27e930e6e0..3a937dfaec 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index e2b7ca9b03..b1b51d804e 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include #include From 71636e677d456b4e9f63b6890d094bb1449cd552 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 26 Dec 2018 08:31:51 +0000 Subject: [PATCH 59/77] add min_subgraph_size attr to tensorrt config test=develop --- paddle/fluid/inference/analysis/argument.h | 1 + paddle/fluid/inference/analysis/ir_pass_manager.cc | 2 ++ .../analysis/ir_passes/tensorrt_subgraph_pass.cc | 6 ++++-- paddle/fluid/inference/api/analysis_config.cc | 8 ++++++-- paddle/fluid/inference/api/analysis_predictor.cc | 1 + paddle/fluid/inference/api/paddle_analysis_config.h | 13 ++++++++++++- 6 files changed, 26 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 83d411eecf..2db5705d09 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -127,6 +127,7 @@ struct Argument { std::function); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); + DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); // The program transformed by IR analysis phase. 
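// (Illustration, not part of the diff.) DECL_ARGUMENT_FIELD's definition is
// outside this hunk, but the rest of the patch pins down the interface it
// generates for the new field: ir_pass_manager.cc reads
// argument->tensorrt_min_subgraph_size() and analysis_predictor.cc calls
// argument_.SetTensorRtMinSubgraphSize(...). A plausible hand-expansion,
// with the exact generated code being an assumption:
//
//   int& tensorrt_min_subgraph_size() { return tensorrt_min_subgraph_size_; }
//   void SetTensorRtMinSubgraphSize(const int& v) {
//     tensorrt_min_subgraph_size_ = v;
//   }
//   int tensorrt_min_subgraph_size_;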
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 51bca8039d..b8c9426ed3 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -75,6 +75,8 @@ void IRPassManager::CreatePasses(Argument *argument, argument->tensorrt_node_teller_ptr()); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); + pass->Set("min_subgraph_size", + new int(argument->tensorrt_min_subgraph_size())); } // graph_ = pass->Apply(std::move(graph_)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 5886868be0..ad10010e42 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -38,7 +38,8 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( auto teller = Get("tensorrt_node_teller"); - SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/); + SubGraphFuser fuser(graph.get(), teller, + Get("min_subgraph_size") /*min subgraph size*/); fuser(); for (auto *node : graph->Nodes()) { @@ -233,4 +234,5 @@ REGISTER_PASS(tensorrt_subgraph_pass, paddle::inference::analysis::TensorRtSubgraphPass) .RequirePassAttr("tensorrt_node_teller") .RequirePassAttr("max_batch_size") - .RequirePassAttr("workspace_size"); + .RequirePassAttr("workspace_size") + .RequirePassAttr("min_subgraph_size"); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 8a0ddfbab4..6d6e799fde 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -57,6 +57,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; model_from_memory_ = other.model_from_memory_; if (use_gpu) { @@ -89,6 +90,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; model_from_memory_ = other.model_from_memory_; pass_builder_ = std::move(other.pass_builder_); @@ -105,11 +107,13 @@ void contrib::AnalysisConfig::EnableMKLDNN() { } void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size) { + int max_batch_size, + int min_subgraph_size) { use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; - // Append after the infer_clean pass. + tensorrt_min_subgraph_size_ = min_subgraph_size; + // Append after the conv+affine_channel fuse pass. 
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3937884ce4..3f8feaaa1e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -328,6 +328,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); + argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); } if (config_.use_mkldnn_) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index f05b9832da..e7ccea6587 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -49,7 +49,7 @@ struct AnalysisConfig : public NativeConfig { bool use_feed_fetch_ops{true}; void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1); + int max_batch_size = 1, int min_subgraph_size = 3); bool use_tensorrt() const { return use_tensorrt_; } void EnableMKLDNN(); @@ -69,8 +69,19 @@ struct AnalysisConfig : public NativeConfig { bool use_tensorrt_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int tensorrt_workspace_size_; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. int tensorrt_max_batchsize_; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int tensorrt_min_subgraph_size_{3}; std::unique_ptr pass_builder_; bool model_from_memory_{false}; }; From 2388d0e7d6277bfbb41a6f17324bb3a0e5df1c9c Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 26 Dec 2018 16:45:57 +0800 Subject: [PATCH 60/77] Revert "cherry-pick the #12759" test=develop This reverts commit 7f6d8acecb0c1d61dad645c581cd8cef9d554841. 
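Taken together with the header defaults above, the extended interface from the preceding TensorRT patch can be exercised as below. A hedged usage sketch; the function and its pointer argument are illustrative, only EnableTensorRtEngine and its defaults come from the diff:

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

void ConfigureTensorRt(paddle::contrib::AnalysisConfig* config) {
  // workspace_size: scratch memory handed to TensorRT (see the NVIDIA
  // troubleshooting link in the header comment); max_batch_size: build the
  // engine for the batch size actually used at runtime, since smaller
  // batches may run less optimized; min_subgraph_size: only fuse a subgraph
  // into a TRT engine when it has at least this many convertible nodes.
  config->EnableTensorRtEngine(1 << 20 /*workspace_size*/,
                               1 /*max_batch_size*/,
                               3 /*min_subgraph_size*/);
}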
--- paddle/fluid/framework/op_proto_maker.cc | 4 -- paddle/fluid/framework/op_proto_maker.h | 1 - paddle/fluid/framework/operator.cc | 71 +++++-------------- paddle/fluid/pybind/const_value.cc | 3 - python/paddle/fluid/framework.py | 5 -- .../tests/unittests/test_operator_desc.py | 2 +- 6 files changed, 18 insertions(+), 68 deletions(-) diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 2311614c33..ca31303f77 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -82,10 +82,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") .SetDefault(""); - AddAttr>(OpCreationCallstackAttrName(), - "Callstack for Op Creatation.") - .SetDefault({}); - Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 0a0f8f4655..4c59c73d87 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -47,7 +47,6 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } - static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ac2828136b..f48e403cef 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,15 +16,10 @@ limitations under the License. */ #include #include -#include -#include -#include -#include "gflags/gflags.h" -#include "glog/logging.h" + #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" @@ -162,59 +157,27 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - try { - if (VLOG_IS_ON(4)) { - VLOG(4) << place << " " << DebugStringEx(&scope); - } - if (platform::is_gpu_place(place)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif - } - - // The profile has a process-wide mutex, results in serious performance - // issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. 
- if (platform::IsProfileEnabled()) { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - } else { - RunImpl(scope, place); - } - - if (VLOG_IS_ON(3)) { - VLOG(3) << place << " " << DebugStringEx(&scope); - } - } catch (platform::EnforceNotMet exception) { - if (Attrs().count("sub_block") != 0) { - throw exception; - } - - auto& callstack = Attr>( - OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + } - if (callstack.empty()) { - throw exception; - } - std::ostringstream sout; - sout << "Invoke operator " << Type() << " error.\n"; - sout << "Python Callstacks: \n"; - for (auto& line : callstack) { - sout << line; - } - sout << "C++ Callstacks: \n"; - sout << exception.err_str_; - exception.err_str_ = sout.str(); - throw exception; - } catch (...) { - std::rethrow_exception(std::current_exception()); + // The profile has a process-wide mutex, results in serious performance issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); } + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index f8ded9f94e..06d8b65fb1 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -49,9 +49,6 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); - op_proto_and_checker_maker.def( - "kOpCreationCallstackAttrName", - framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3427fb0c4a..de30ed2fc5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -20,7 +20,6 @@ import os import re import six import sys -import traceback import numpy as np @@ -605,10 +604,6 @@ class Operator(object): if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] - callstack_var_name = op_maker.kOpCreationCallstackAttrName() - op_attrs[callstack_var_name] = list( - reversed(traceback.format_stack()))[1:] - if len(self.desc.type()) != 0: return if type is None: diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 37b9a9188a..4153394c1d 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope", "op_callstack" + "op_namescope" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) From e49276e731716a1f9f796d102f82ebf58effb22b Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 26 Dec 2018 17:53:08 +0800 Subject: [PATCH 61/77] restore the huber_loss_op test=develop --- paddle/fluid/operators/huber_loss_op.h | 8 
++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h index 666500ef26..9efda3dfc9 100644 --- a/paddle/fluid/operators/huber_loss_op.h +++ b/paddle/fluid/operators/huber_loss_op.h @@ -104,19 +104,15 @@ class HuberLossGradKernel : public framework::OpKernel { if (out0) { out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); - // MSVC not treat it well when partial template arguments were specified x_grad.device(place) = - out_grad * - residual.unaryExpr(HuberLossBackward(delta, static_cast(-1.0))); + out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenVector::Flatten(*out1); - // MSVC not treat it well when partial template arguments were specified y_grad.device(place) = - out_grad * - residual.unaryExpr(HuberLossBackward(delta, static_cast(1.0))); + out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); } } }; From 02e17396c24f0deb11826e37a579a69dc41ca382 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 26 Dec 2018 11:33:35 +0000 Subject: [PATCH 62/77] fix comments test=develop --- paddle/fluid/inference/api/paddle_pass_builder.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index d327f2bcec..1062ac5f58 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -118,13 +118,13 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", - "conv_eltwiseadd_affine_channel_fuse_pass", - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // }); } From 3e917a934af212ab3ff3b2704666fb283cb3ed11 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 26 Dec 2018 08:03:13 +0000 Subject: [PATCH 63/77] add scope_pool add module cleanup test=develop --- paddle/contrib/float16/float16_transpiler.py | 2 +- paddle/fluid/framework/CMakeLists.txt | 1 + paddle/fluid/framework/scope_pool.cc | 54 +++++++++++++++++++ paddle/fluid/framework/scope_pool.h | 46 ++++++++++++++++ paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/pybind.cc | 17 +++++- python/paddle/fluid/__init__.py | 2 +- python/paddle/fluid/executor.py | 2 +- .../fluid/tests/unittests/test_py_func_op.py | 6 +-- .../fluid/transpiler/inference_transpiler.py | 2 +- 10 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/framework/scope_pool.cc create mode 100644 paddle/fluid/framework/scope_pool.h diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 8d95dc0591..500f64bed9 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -60,7 +60,7 @@ class Float16Transpiler: raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not 
isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") self.scope = scope diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 412bc9cbe8..514eeb5347 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -84,6 +84,7 @@ cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(scope_pool SRCS scope_pool.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc new file mode 100644 index 0000000000..5cb241a7a3 --- /dev/null +++ b/paddle/fluid/framework/scope_pool.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace framework { + +ScopePool &ScopePool::Instance() { // NOLINT + static ScopePool pool; + return pool; +} + +void ScopePool::DeleteScope(Scope *scope) { delete scope; } + +void ScopePool::Insert(std::unique_ptr &&s) { + std::lock_guard guard(mtx_); + scopes_.insert(s.release()); +} + +void ScopePool::Remove(Scope *s) { + size_t has_scope; + { + std::lock_guard guard(mtx_); + has_scope = scopes_.erase(s); + } + PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope"); + DeleteScope(s); +} + +ScopePool::~ScopePool() { Clear(); } + +void ScopePool::Clear() { + std::lock_guard guard(mtx_); + for (auto *s : scopes_) { + DeleteScope(s); + } + scopes_.clear(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope_pool.h b/paddle/fluid/framework/scope_pool.h new file mode 100644 index 0000000000..a8b468699a --- /dev/null +++ b/paddle/fluid/framework/scope_pool.h @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
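// (Usage sketch, not part of the new files.) The pool implemented in
// scope_pool.cc above owns every scope handed to it: Insert transfers
// ownership, Remove deletes one pooled scope (and enforces that it was in
// the pool), and Clear deletes whatever remains, which is the hook pybind.cc
// installs for interpreter shutdown later in this patch.
//
//   #include <memory>
//   #include "paddle/fluid/framework/scope.h"
//   #include "paddle/fluid/framework/scope_pool.h"
//
//   using paddle::framework::Scope;
//   using paddle::framework::ScopePool;
//
//   Scope* s = new Scope();
//   ScopePool::Instance().Insert(std::unique_ptr<Scope>(s));  // pool owns s
//   ScopePool::Instance().Remove(s);  // deletes s now
//   ScopePool::Instance().Clear();    // deletes any remaining pooled scopes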
+ +#pragma once + +#include // NOLINT +#include +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +class ScopePool { + public: + static ScopePool &Instance(); // NOLINT + + void Insert(std::unique_ptr &&s); + + void Remove(Scope *s); + + void Clear(); + + ~ScopePool(); + + private: + ScopePool() = default; + + static void DeleteScope(Scope *scope); + + std::unordered_set scopes_; + std::mutex mtx_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb8bcb190b..72b0f216d3 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 88a2a5276a..81d63aace0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -32,6 +32,7 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope_pool.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" @@ -117,6 +118,9 @@ PYBIND11_MODULE(core, m) { return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); }); + m.add_object("_cleanup", + py::capsule([]() { ScopePool::Instance().Clear(); })); + py::class_(m, "VarBase", R"DOC()DOC") .def(py::init<>()) .def("_run_backward", @@ -454,7 +458,7 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::copy); - py::class_(m, "Scope", R"DOC( + py::class_(m, "_Scope", R"DOC( Scope is an association of a name to Variable. All variables belong to Scope. Variables in a parent scope can be retrieved from local scope. @@ -474,17 +478,26 @@ All parameter, weight, gradient are variables in Paddle. param.set(param_array, place) )DOC") + .def("_remove_from_pool", + [](Scope &self) { ScopePool::Instance().Remove(&self); }) .def("var", [](Scope &self, const std::string &name) -> Variable * { return self.Var(name); }, py::return_value_policy::reference) .def("find_var", &Scope::FindVar, py::return_value_policy::reference) - .def(py::init<>()) .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, py::return_value_policy::reference) .def("drop_kids", &Scope::DropKids); + m.def("Scope", + []() -> Scope * { + auto *s = new Scope(); + ScopePool::Instance().Insert(std::unique_ptr(s)); + return s; + }, + py::return_value_policy::reference); + //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. m.def("get_all_op_protos", []() -> std::vector { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8f3660ca38..e0078e5314 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -46,7 +46,7 @@ from . import transpiler from . 
import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder -from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope +from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f2886090d7..5a9e908b61 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -191,7 +191,7 @@ def _fetch_var(name, scope=None, return_numpy=True): assert isinstance(name, str) if scope is None: scope = global_scope() - assert isinstance(scope, core.Scope) + assert isinstance(scope, core._Scope) var = scope.find_var(name) assert var is not None, ( diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 943ad3ed22..655378f7f8 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -26,7 +26,7 @@ os.environ['CPU_NUM'] = str(dev_cnt) def dummy_func_with_no_input(): - return float(1.0) + return np.array([0], dtype='float32') def dummy_func_with_no_output(x): @@ -105,7 +105,7 @@ def simple_fc_net(img, label, use_py_func_op): name='test_tmp_var', dtype='float32', shape=[1]) fluid.layers.py_func( func=dummy_func_with_no_input, x=None, out=dummy_var) - + loss += dummy_var fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None) loss = fluid.layers.mean(loss) @@ -174,7 +174,7 @@ class TestPyFuncOpUseExecutor(unittest.TestCase): self.assertAlmostEqual(max_diff, 0, delta=1e-3) -class TestPyFuncOpUseParallelExecutor(unittest.TestCase): +class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor): def setUp(self): self.use_parallel_executor = True diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index ccf7af334d..cc7f5ec90c 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -57,7 +57,7 @@ class InferenceTranspiler(object): raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) From 10a6bc9675848c6ab0a30b7dc47f9d5c8788b0d1 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 26 Dec 2018 11:53:29 +0000 Subject: [PATCH 64/77] modify API.spec test=develop --- paddle/fluid/API.spec | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e3b4449925..3970d9a731 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -447,11 +447,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) 
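The API.spec hunk continuing below records the user-visible effect of the pybind change above: paddle.fluid.Scope is no longer a bound class but a factory returning a pooled core._Scope. The companion `_cleanup` object relies on pybind11's documented module-destructor idiom, sketched here with an illustrative module name:

#include <pybind11/pybind11.h>
namespace py = pybind11;

void CleanupPool() { /* e.g. ScopePool::Instance().Clear(); */ }

PYBIND11_MODULE(demo, m) {  // "demo" is an illustrative module name
  // The capsule's destructor runs when the module object is torn down, so
  // pooled scopes are released at interpreter shutdown.
  m.add_object("_cleanup", py::capsule(CleanupPool));
}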
paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) -paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable -paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope -paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) From 05f1b65da34a9daa3b8edc218505fa7b74ca3069 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Wed, 26 Dec 2018 18:53:28 +0800 Subject: [PATCH 65/77] simplify prepere_input in analyzer_test test=develop --- paddle/fluid/inference/api/helper.h | 10 ++++++++ .../tests/api/analyzer_lac_tester.cc | 4 +--- .../tests/api/analyzer_mm_dnn_tester.cc | 12 ++++------ .../tests/api/analyzer_ner_tester.cc | 11 ++++----- .../tests/api/analyzer_seq_conv1_tester.cc | 24 ++++++------------- 5 files changed, 26 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 9a393a61c4..7830e85956 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -113,6 +113,16 @@ static void TensorAssignData(PaddleTensor *tensor, } } +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data, + const std::vector &lod) { + int size = lod[lod.size() - 1]; + tensor->shape.assign({size, 1}); + tensor->lod.assign({lod}); + TensorAssignData(tensor, data); +} + template static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, const std::vector> &data) { diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 142801382b..2213971c17 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -98,10 +98,8 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, auto one_batch = data->NextBatch(); PaddleTensor input_tensor; input_tensor.name = "word"; - input_tensor.shape.assign({static_cast(one_batch.data.size()), 1}); - input_tensor.lod.assign({one_batch.lod}); input_tensor.dtype = PaddleDType::INT64; - TensorAssignData(&input_tensor, {one_batch.data}); + TensorAssignData(&input_tensor, {one_batch.data}, one_batch.lod); PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); input_slots->assign({input_tensor}); } diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 8aaab6d664..98335fe4f8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -80,15 +80,11 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_query_tensor.name = "left"; lod_title_tensor.name = "right"; auto one_batch = 
data->NextBatch(); - int size1 = one_batch.lod1[one_batch.lod1.size() - 1]; // token batch size - int size2 = one_batch.lod2[one_batch.lod2.size() - 1]; // token batch size - lod_query_tensor.shape.assign({size1, 1}); - lod_query_tensor.lod.assign({one_batch.lod1}); - lod_title_tensor.shape.assign({size2, 1}); - lod_title_tensor.lod.assign({one_batch.lod2}); // assign data - TensorAssignData(&lod_query_tensor, one_batch.query_data_all); - TensorAssignData(&lod_title_tensor, one_batch.title_data_all); + TensorAssignData(&lod_query_tensor, one_batch.query_data_all, + one_batch.lod1); + TensorAssignData(&lod_title_tensor, one_batch.title_data_all, + one_batch.lod2); // Set inputs. input_slots->assign({lod_query_tensor, lod_title_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index f19a2ed59e..54298fdab2 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -78,14 +78,11 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_word_tensor.name = "word"; lod_mention_tensor.name = "mention"; auto one_batch = data->NextBatch(); - int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size - lod_word_tensor.shape.assign({size, 1}); - lod_word_tensor.lod.assign({one_batch.lod}); - lod_mention_tensor.shape.assign({size, 1}); - lod_mention_tensor.lod.assign({one_batch.lod}); // assign data - TensorAssignData(&lod_word_tensor, one_batch.word_data_all); - TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all); + TensorAssignData(&lod_word_tensor, one_batch.word_data_all, + one_batch.lod); + TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all, + one_batch.lod); // Set inputs. 
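// (Illustration, not part of the diff.) The new TensorAssignData overload in
// helper.h above derives shape and LoD in one place. For example:
//
//   std::vector<std::vector<int64_t>> data = {{9, 8, 7}, {6, 5}};
//   std::vector<size_t> lod = {0, 3, 5};
//   PaddleTensor t;
//   t.name = "word";
//   t.dtype = PaddleDType::INT64;
//   TensorAssignData<int64_t>(&t, data, lod);
//
// The token count is lod.back() == 5, so t.shape becomes {5, 1} and t.lod
// becomes {{0, 3, 5}}, exactly the bookkeeping each tester used to repeat
// by hand.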
input_slots->assign({lod_word_tensor, lod_mention_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index f5082cd60f..49f6059715 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -109,24 +109,14 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, title3_tensor.name = "title3"; l1_tensor.name = "l1"; auto one_batch = data->NextBatch(); - int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1]; - title1_tensor.shape.assign({title1_size, 1}); - title1_tensor.lod.assign({one_batch.title1_lod}); - int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1]; - title2_tensor.shape.assign({title2_size, 1}); - title2_tensor.lod.assign({one_batch.title2_lod}); - int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1]; - title3_tensor.shape.assign({title3_size, 1}); - title3_tensor.lod.assign({one_batch.title3_lod}); - int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1]; - l1_tensor.shape.assign({l1_size, 1}); - l1_tensor.lod.assign({one_batch.l1_lod}); - // assign data - TensorAssignData(&title1_tensor, one_batch.title1); - TensorAssignData(&title2_tensor, one_batch.title2); - TensorAssignData(&title3_tensor, one_batch.title3); - TensorAssignData(&l1_tensor, one_batch.l1); + TensorAssignData(&title1_tensor, one_batch.title1, + one_batch.title1_lod); + TensorAssignData(&title2_tensor, one_batch.title2, + one_batch.title2_lod); + TensorAssignData(&title3_tensor, one_batch.title3, + one_batch.title3_lod); + TensorAssignData(&l1_tensor, one_batch.l1, one_batch.l1_lod); // Set inputs. input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); for (auto &tensor : *input_slots) { From ecae157edf352ad73c8e60a90ced540fe0e48ff3 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Wed, 26 Dec 2018 21:31:45 +0800 Subject: [PATCH 66/77] simplify some data record in analyzer_tester test=develop --- .../tests/api/analyzer_mm_dnn_tester.cc | 35 +++------- .../tests/api/analyzer_ner_tester.cc | 33 +++------- .../tests/api/analyzer_seq_conv1_tester.cc | 64 ++++--------------- .../fluid/inference/tests/api/tester_helper.h | 12 ++++ 4 files changed, 45 insertions(+), 99 deletions(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 98335fe4f8..9d3c751943 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> query_data_all, title_data_all; + std::vector> query, title; std::vector lod1, lod2; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,22 +31,9 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. 
- if (batch_end <= query_data_all.size()) { - data.query_data_all.assign(query_data_all.begin() + batch_iter, - query_data_all.begin() + batch_end); - data.title_data_all.assign(title_data_all.begin() + batch_iter, - title_data_all.begin() + batch_end); - // Prepare LoDs - data.lod1.push_back(0); - data.lod2.push_back(0); - CHECK(!data.query_data_all.empty()); - CHECK(!data.title_data_all.empty()); - CHECK_EQ(data.query_data_all.size(), data.title_data_all.size()); - for (size_t j = 0; j < data.query_data_all.size(); j++) { - // calculate lod - data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size()); - data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size()); - } + if (batch_end <= query.size()) { + GetInputPerBatch(query, &data.query, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title, &data.title, &data.lod2, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -67,8 +52,8 @@ struct DataRecord { // load title data std::vector title_data; split_to_int64(data[1], ' ', &title_data); - query_data_all.push_back(std::move(query_data)); - title_data_all.push_back(std::move(title_data)); + query.push_back(std::move(query_data)); + title.push_back(std::move(title_data)); } num_samples = num_lines; } @@ -81,10 +66,8 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_title_tensor.name = "right"; auto one_batch = data->NextBatch(); // assign data - TensorAssignData(&lod_query_tensor, one_batch.query_data_all, - one_batch.lod1); - TensorAssignData(&lod_title_tensor, one_batch.title_data_all, - one_batch.lod2); + TensorAssignData(&lod_query_tensor, one_batch.query, one_batch.lod1); + TensorAssignData(&lod_title_tensor, one_batch.title, one_batch.lod2); // Set inputs. input_slots->assign({lod_query_tensor, lod_title_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 54298fdab2..f8635968ce 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> word_data_all, mention_data_all; + std::vector> word, mention; std::vector lod; // two inputs have the same lod info. - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,20 +31,10 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. 
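// (Note, not part of the diff.) GetInputPerBatch, defined in tester_helper.h
// further below, clears and rebuilds *lod on every call. That is what keeps
// the pattern used next for this tester correct: word and mention are
// aligned sequence-for-sequence (they "have the same lod info"), so sharing
// one lod and letting the second call recompute identical offsets is safe:
//
//   GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end);
//   GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter,
//                    batch_end);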
- if (batch_end <= word_data_all.size()) { - data.word_data_all.assign(word_data_all.begin() + batch_iter, - word_data_all.begin() + batch_end); - data.mention_data_all.assign(mention_data_all.begin() + batch_iter, - mention_data_all.begin() + batch_end); - // Prepare LoDs - data.lod.push_back(0); - CHECK(!data.word_data_all.empty()); - CHECK(!data.mention_data_all.empty()); - CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size()); - for (size_t j = 0; j < data.word_data_all.size(); j++) { - // calculate lod - data.lod.push_back(data.lod.back() + data.word_data_all[j].size()); - } + if (batch_end <= word.size()) { + GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end); + GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter, + batch_end); } batch_iter += batch_size; return data; @@ -65,8 +53,8 @@ struct DataRecord { // load mention data std::vector mention_data; split_to_int64(data[3], ' ', &mention_data); - word_data_all.push_back(std::move(word_data)); - mention_data_all.push_back(std::move(mention_data)); + word.push_back(std::move(word_data)); + mention.push_back(std::move(mention_data)); } num_samples = num_lines; } @@ -79,9 +67,8 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_mention_tensor.name = "mention"; auto one_batch = data->NextBatch(); // assign data - TensorAssignData(&lod_word_tensor, one_batch.word_data_all, - one_batch.lod); - TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all, + TensorAssignData(&lod_word_tensor, one_batch.word, one_batch.lod); + TensorAssignData(&lod_mention_tensor, one_batch.mention, one_batch.lod); // Set inputs. input_slots->assign({lod_word_tensor, lod_mention_tensor}); diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 49f6059715..e6d6cd2960 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -18,12 +18,9 @@ namespace paddle { namespace inference { struct DataRecord { - std::vector> title1_all, title2_all, title3_all, l1_all; std::vector> title1, title2, title3, l1; - std::vector title1_lod, title2_lod, title3_lod, l1_lod; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + std::vector lod1, lod2, lod3, l1_lod; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,41 +30,11 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. 
- if (batch_end <= title1_all.size()) { - data.title1_all.assign(title1_all.begin() + batch_iter, - title1_all.begin() + batch_end); - data.title2_all.assign(title2_all.begin() + batch_iter, - title2_all.begin() + batch_end); - data.title3_all.assign(title3_all.begin() + batch_iter, - title3_all.begin() + batch_end); - data.l1_all.assign(l1_all.begin() + batch_iter, - l1_all.begin() + batch_end); - // Prepare LoDs - data.title1_lod.push_back(0); - data.title2_lod.push_back(0); - data.title3_lod.push_back(0); - data.l1_lod.push_back(0); - CHECK(!data.title1_all.empty()); - CHECK(!data.title2_all.empty()); - CHECK(!data.title3_all.empty()); - CHECK(!data.l1_all.empty()); - CHECK_EQ(data.title1_all.size(), data.title2_all.size()); - CHECK_EQ(data.title1_all.size(), data.title3_all.size()); - CHECK_EQ(data.title1_all.size(), data.l1_all.size()); - for (size_t j = 0; j < data.title1_all.size(); j++) { - data.title1.push_back(data.title1_all[j]); - data.title2.push_back(data.title2_all[j]); - data.title3.push_back(data.title3_all[j]); - data.l1.push_back(data.l1_all[j]); - // calculate lod - data.title1_lod.push_back(data.title1_lod.back() + - data.title1_all[j].size()); - data.title2_lod.push_back(data.title2_lod.back() + - data.title2_all[j].size()); - data.title3_lod.push_back(data.title3_lod.back() + - data.title3_all[j].size()); - data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size()); - } + if (batch_end <= title1.size()) { + GetInputPerBatch(title1, &data.title1, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title2, &data.title2, &data.lod2, batch_iter, batch_end); + GetInputPerBatch(title3, &data.title3, &data.lod3, batch_iter, batch_end); + GetInputPerBatch(l1, &data.l1, &data.l1_lod, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -92,10 +59,10 @@ struct DataRecord { // load l1 data std::vector l1_data; split_to_int64(data[3], ' ', &l1_data); - title1_all.push_back(std::move(title1_data)); - title2_all.push_back(std::move(title2_data)); - title3_all.push_back(std::move(title3_data)); - l1_all.push_back(std::move(l1_data)); + title1.push_back(std::move(title1_data)); + title2.push_back(std::move(title2_data)); + title3.push_back(std::move(title3_data)); + l1.push_back(std::move(l1_data)); } num_samples = num_lines; } @@ -110,12 +77,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, l1_tensor.name = "l1"; auto one_batch = data->NextBatch(); // assign data - TensorAssignData(&title1_tensor, one_batch.title1, - one_batch.title1_lod); - TensorAssignData(&title2_tensor, one_batch.title2, - one_batch.title2_lod); - TensorAssignData(&title3_tensor, one_batch.title3, - one_batch.title3_lod); + TensorAssignData(&title1_tensor, one_batch.title1, one_batch.lod1); + TensorAssignData(&title2_tensor, one_batch.title2, one_batch.lod2); + TensorAssignData(&title3_tensor, one_batch.title3, one_batch.lod3); TensorAssignData(&l1_tensor, one_batch.l1, one_batch.l1_lod); // Set inputs. 
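// (Worked example, not part of the diff.) All of the manual slicing above
// collapses into the shared helper added to tester_helper.h just below; its
// behavior on a small input:
//
//   std::vector<std::vector<int64_t>> in = {{1, 2}, {3}, {4, 5, 6}};
//   std::vector<std::vector<int64_t>> out;
//   std::vector<size_t> lod;
//   GetInputPerBatch(in, &out, &lod, 1 /*batch_iter*/, 3 /*batch_end*/);
//   // out == {{3}, {4, 5, 6}}, lod == {0, 1, 4}
//
// Note that the helper clears *lod but only appends to *out, so callers
// should pass a fresh output vector.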
input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index b0c8f395ce..144027589c 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -169,6 +169,18 @@ void SetFakeImageInput(std::vector> *inputs, (*inputs).emplace_back(input_slots); } +void GetInputPerBatch(const std::vector> &in, + std::vector> *out, + std::vector *lod, size_t batch_iter, + size_t batch_end) { + lod->clear(); + lod->push_back(0); + for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) { + out->push_back(*it); + lod->push_back(lod->back() + (*it).size()); // calculate lod + } +} + void TestOneThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, From ce7e503cbe10dee0f3cad2145bec4559ab89f00f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 25 Dec 2018 14:40:55 +0800 Subject: [PATCH 67/77] refactor to avoid scope. test=develop --- paddle/fluid/framework/operator.cc | 60 +++++- paddle/fluid/framework/operator.h | 10 + paddle/fluid/imperative/layer.cc | 188 ++++++++---------- paddle/fluid/imperative/layer.h | 45 +++-- paddle/fluid/imperative/tracer.h | 120 ++++++++--- paddle/fluid/operators/fill_constant_op.cc | 35 ++++ paddle/fluid/pybind/pybind.cc | 12 +- python/paddle/fluid/framework.py | 37 ++-- python/paddle/fluid/imperative/base.py | 3 +- python/paddle/fluid/layer_helper.py | 21 +- python/paddle/fluid/layers/nn.py | 2 + .../fluid/tests/unittests/test_imperative.py | 13 +- 12 files changed, 347 insertions(+), 199 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2e7006ed95..38675d2cac 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -180,6 +180,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(3) << place << " " << DebugStringEx(&scope); } +void OperatorBase::Run(const RuntimeContext& ctx, + const platform::Place& place) { + RunImpl(ctx, place); +} + bool OperatorBase::HasInputs(const std::string& name) const { return inputs_.find(name) != inputs_.end(); } @@ -954,6 +959,51 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } +void OperatorWithKernel::RunImpl(const RuntimeContext& ctx, + const platform::Place& place) const { + Scope scope; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. 
+ auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", type_); + } + + OpKernelMap& kernels = kernels_iter->second; + + auto expected_kernel_key = this->GetExpectedKernelType( + ExecutionContext(*this, scope, *dev_ctx, ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } + + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); + } + + RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx); + this->InferShape(&infer_shape_ctx); + kernel_iter->second(ExecutionContext(*this, scope, *dev_ctx, ctx)); +} + void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { @@ -1041,12 +1091,9 @@ Scope* OperatorWithKernel::PrepareData( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - auto& scope = ctx.scope(); int data_type = -1; - std::string last_input_name; for (auto& input : this->inputs_) { - for (auto& ipt_name : input.second) { - auto* var = scope.FindVar(ipt_name); + for (const Variable* var : ctx.MultiInputVar(input.first)) { if (var != nullptr) { const Tensor* t = nullptr; if (var->IsType()) { @@ -1062,10 +1109,9 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", - Type(), last_input_name, data_type, ipt_name, tmp); + "DataType of Paddle Op %s must be the same. Get (%d) != (%d)", + Type(), data_type, tmp); data_type = tmp; - last_input_name = ipt_name; } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index bad9716e8b..446d27efa0 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -81,6 +81,10 @@ class RuntimeContext { RuntimeContext(const VariableNameMap& innames, const VariableNameMap& outnames, const Scope& scope); + RuntimeContext(const VariableValueMap& invars, + const VariableValueMap& outvars) + : inputs(invars), outputs(outvars) {} + VariableValueMap inputs; VariableValueMap outputs; }; @@ -101,6 +105,7 @@ class OperatorBase { /// Executor will call this interface function to Run an op. // The implementation should be written at RunImpl void Run(const Scope& scope, const platform::Place& place); + void Run(const RuntimeContext& ctx, const platform::Place& place); // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. 
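// (Illustration, not part of the diff.) The new Run/RunImpl overloads let a
// caller that already holds Variable pointers, such as the imperative
// tracer, execute an op without any scope lookup. A hedged sketch of a call
// site; x_var and out_var stand for caller-owned framework::Variable
// pointers:
//
//   framework::VariableValueMap ins, outs;
//   ins["X"] = {x_var};
//   outs["Out"] = {out_var};
//   framework::RuntimeContext ctx(ins, outs);
//   op->Run(ctx, platform::CPUPlace());  // forwards to RunImpl(ctx, place)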
virtual void Stop() {} @@ -167,6 +172,9 @@ class OperatorBase { void CheckAllInputOutputSet() const; virtual void RunImpl(const Scope& scope, const platform::Place& place) const = 0; + + virtual void RunImpl(const RuntimeContext& ctx, + const platform::Place& place) const {} }; class ExecutionContext { @@ -458,6 +466,8 @@ class OperatorWithKernel : public OperatorBase { // same. proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; + void RunImpl(const RuntimeContext& ctx, + const platform::Place& place) const final; /** * Transfer data from scope to a transfered scope. If there is no data need to diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 342cb68ab2..239ff029db 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -31,6 +31,11 @@ using framework::Variable; void AddTo(Variable* src, Variable* dst) { framework::LoDTensor* dst_tensor = dst->GetMutable(); framework::LoDTensor* src_tensor = src->GetMutable(); + + VLOG(3) << "apply var grad " << src_tensor->data()[0] << " " + << src_tensor->data()[1] << " " + << src_tensor->data()[2]; + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", dst_tensor->numel(), src_tensor->numel()); float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); @@ -38,16 +43,28 @@ void AddTo(Variable* src, Variable* dst) { for (size_t i = 0; i < src_tensor->numel(); ++i) { dst_data[i] += src_data[i]; } + + VLOG(3) << "apply var dst grad " << dst_tensor->data()[0] << " " + << dst_tensor->data()[1] << " " + << dst_tensor->data()[2]; } class Autograd { public: - explicit Autograd(framework::Scope* scope) : scope_(scope) {} + Autograd() {} void RunBackward(VarBase* var) { PADDLE_ENFORCE(var->pre_op_->op_desc_); // TODO(panyx0718): Only create for vars that "require_grad" - (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; + LOG(ERROR) << reinterpret_cast(var->grads_) << " vs " + << reinterpret_cast( + var->pre_op_ + ->output_vars_[var->pre_op_out_name_] + [var->pre_op_out_idx_] + ->grads_); + var->pre_op_->output_vars_[var->pre_op_out_name_][var->pre_op_out_idx_] + ->grads_->GetMutable() + ->ShareDataWith(var->grads_->Get()); std::deque ready; ready.push_back(var->pre_op_); @@ -57,18 +74,23 @@ class Autograd { while (!ready.empty()) { OpBase* ready_op = ready.front(); ready.pop_front(); - std::vector input_grads = ready_op->ApplyGrad(scope_); - - for (size_t i = 0; i < input_grads.size(); ++i) { - if (!input_grads[i]) continue; - OpBase* pre_op = ready_op->pre_ops_->at(i); - if (!pre_op) continue; - - dep_counts[pre_op] -= 1; - PADDLE_ENFORCE(dep_counts[pre_op] >= 0); - bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - ready.push_back(pre_op); + std::map> input_grads = + ready_op->ApplyGrad(); + VLOG(3) << "after apply grad"; + + for (auto it : input_grads) { + const std::vector& ingrads = it.second; + for (size_t i = 0; i < ingrads.size(); ++i) { + if (!ingrads[i]) continue; + OpBase* pre_op = (*ready_op->pre_ops_)[it.first][i]; + if (!pre_op) continue; + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); + } } } } @@ -85,26 +107,25 @@ class Autograd { while (!queue.empty()) { OpBase* candidate = queue.front(); queue.pop_front(); - for (OpBase* pre_op : *(candidate->pre_ops_)) { - if (!pre_op) continue; - if 
(visited.find(pre_op) == visited.end()) { - visited.insert(pre_op); - queue.push_back(pre_op); + for (auto it : *(candidate->pre_ops_)) { + for (OpBase* pre_op : it.second) { + if (!pre_op) continue; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; } - ret[pre_op] += 1; } } - return ret; } - - framework::Scope* scope_; }; -framework::Variable* CreateVariable(const std::string& name, - const framework::DDim& dim, float val, - framework::Scope* scope, - bool random_name = true) { +void CreateVariable(const std::string& name, const framework::DDim& dim, + float val, bool random_name, framework::Variable* var) { + if (var->IsInitialized()) return; + std::string varname = name; if (random_name) { std::mt19937 rng; @@ -116,12 +137,9 @@ framework::Variable* CreateVariable(const std::string& name, } VLOG(3) << "creating var " << varname; - framework::Variable* var = scope->Var(varname); framework::LoDTensor* tensor = var->GetMutable(); - float* data = tensor->mutable_data(dim, platform::CPUPlace()); std::fill(data, data + tensor->numel(), val); - return var; } framework::LoDTensor& VarBase::Grad() { @@ -129,94 +147,56 @@ framework::LoDTensor& VarBase::Grad() { return *grads_->GetMutable(); } -void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { - VLOG(3) << "apply var grad " << var_desc_->Name() << " " - << grad->Get().data()[0]; - if (!grads_) { - grads_ = - CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), - var_->Get().dims(), 0.0, scope); +std::map> OpBase::ApplyGrad() { + if (!grad_op_desc_) { + VLOG(3) << "op with no grad: " << op_desc_->Type(); + return {}; } - AddTo(grad, grads_); - VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " - << grads_->Get().data()[0]; -} - -std::vector OpBase::ApplyGrad(framework::Scope* scope) { VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { - if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { - // grad op inputs can be forward inputs, so not in grad_to_var. - continue; - } - VLOG(3) << "op grad in var " << grad_invar; - block_->FindRecursiveOrCreateVar(grad_invar); - framework::Variable* var = scope->Var(grad_invar); - const std::string& invar = grad_to_var_->at(grad_invar); - for (VarBase* varbase : *output_vars_) { - // Use the accumulated grads_ by sharing the input with grads_. 
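ComputeDepCounts above plus the ready-queue loop in RunBackward amount to a reverse topological sweep: an op's gradient may run only once every op that consumed its outputs has finished. A self-contained sketch of that scheduling idea, with Node as a hypothetical stand-in for OpBase and dep_counts precomputed as ComputeDepCounts does:

    #include <deque>
    #include <map>
    #include <vector>

    struct Node { std::vector<Node*> pre; };  // producers of this op's inputs

    void BackwardSweep(Node* start, std::map<Node*, int> dep_counts) {
      std::deque<Node*> ready{start};
      while (!ready.empty()) {
        Node* n = ready.front();
        ready.pop_front();
        // ... run n's grad op here ...
        for (Node* p : n->pre) {
          if (p != nullptr && --dep_counts[p] == 0) {
            ready.push_back(p);  // last consumer finished; p is now ready
          }
        }
      }
    }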
- if (varbase->var_desc_->Name() == invar) { - var->GetMutable()->ShareDataWith( - varbase->grads_->Get()); - break; - } + std::map> grad_outputs; + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + outputs.push_back(new framework::Variable()); + outputs.back()->GetMutable(); + /* + auto& accum_grad_t = it.second[i]->Get(); + Variable* grad_var = outputs.back(); + float* data = grad_var->GetMutable() + ->mutable_data(accum_grad_t.dims(), platform::CPUPlace()); + std::fill(data, data + accum_grad_t.numel(), 0.0);*/ } } - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - VLOG(3) << "grad outvar " << outvar; - block_->FindRecursiveOrCreateVar(outvar); - framework::Variable* var = scope->Var(outvar); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block_->FindVar(outvar); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - } - grad_op_desc_->InferShape(*block_); + framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + + // grad_op_desc_->InferShape(*block_); grad_op_desc_->InferVarType(block_); + std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc_); - - opbase->Run(*scope, platform::CPUPlace()); - - // `ret` matches exactly with `input_vars_` of forward op. - std::vector ret; - for (size_t i = 0; i < input_vars_->size(); ++i) { - bool found = false; - VarBase* origin_var = (*input_vars_)[i]; - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - Variable* var = scope->FindVar(outvar); - std::string orig_var = grad_to_var_->at(outvar); - if (origin_var->var_desc_->Name() != orig_var) { - continue; - } - VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; - origin_var->ApplyGrad(scope, var); - found = true; - ret.push_back(var); - // TODO(panyx0718): There might be another outvar with the same name. - // In that case, it doesn't matter the first one or the second one is - // used. 
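A detail worth calling out in the new ApplyGrad: the grad op writes into freshly allocated temporary Variables, and only afterwards (just below) is each temporary folded into the persistent grads_ buffer via AddTo. Writing into grads_ directly would clobber gradients already accumulated from a variable's other consumers. The accumulate step itself is the plain element-wise loop from AddTo, assuming float tensors of equal numel:

    // Sketch of the AddTo contract: sizes must match, accumulation is +=.
    void Accumulate(const float* fresh, float* accum, long long numel) {
      for (long long i = 0; i < numel; ++i) {
        accum[i] += fresh[i];
      }
    }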
- break; - } - if (!found) { - ret.push_back(nullptr); + opbase->Run(ctx, platform::CPUPlace()); + + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + auto& origin_outputs = it.second; + for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(outputs[i], orig_grad); + VLOG(3) << "done add to " << grad_op_desc_->Outputs().at(it.first)[i]; } } - return ret; + return input_vars_; } -void VarBase::RunBackward(framework::Scope* scope) { - grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), - var_->Get().dims(), 1.0, scope, - false); +void VarBase::RunBackward() { + auto grads_t = grads_->GetMutable(); + float* data = grads_t->mutable_data(platform::CPUPlace()); + std::fill(data, data + grads_t->numel(), 1.0); + if (!pre_op_) return; - Autograd(scope).RunBackward(this); + Autograd().RunBackward(this); } } // namespace imperative diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 85a71ca83d..eb5fd553bd 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -14,11 +14,11 @@ #pragma once +#include #include #include #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" @@ -33,18 +33,26 @@ class VarBase { : pre_op_(nullptr), pre_op_out_idx_(-1), var_desc_(nullptr), - var_(nullptr), - grads_(nullptr) {} - - virtual ~VarBase() {} - - void ApplyGrad(framework::Scope* scope, framework::Variable* grad); + var_(new framework::Variable()), + grads_(new framework::Variable()) {} + + virtual ~VarBase() { + if (var_) { + delete var_; + var_ = nullptr; + } + if (grads_) { + delete grads_; + grads_ = nullptr; + } + } - void RunBackward(framework::Scope* scope); + void RunBackward(); framework::LoDTensor& Grad(); OpBase* pre_op_; + std::string pre_op_out_name_; int pre_op_out_idx_; framework::VarDesc* var_desc_; @@ -55,17 +63,12 @@ class VarBase { class OpBase { public: OpBase() - : input_vars_(new std::vector()), - output_vars_(new std::vector()), - pre_ops_(new std::vector()), - pre_ops_out_idx_(new std::vector()), + : pre_ops_(new std::map>()), + pre_ops_out_idx_(new std::map>()), op_desc_(nullptr), grad_op_desc_(nullptr) {} virtual ~OpBase() { - delete input_vars_; - delete output_vars_; - delete pre_ops_; delete pre_ops_out_idx_; @@ -73,16 +76,18 @@ class OpBase { if (grad_to_var_) delete grad_to_var_; } - std::vector ApplyGrad(framework::Scope* scope); + std::map> ApplyGrad(); - std::vector* input_vars_; - std::vector* output_vars_; - std::vector* pre_ops_; - std::vector* pre_ops_out_idx_; + std::map> input_vars_; + std::map> output_vars_; + std::map>* pre_ops_; + std::map>* pre_ops_out_idx_; framework::OpDesc* op_desc_; framework::OpDesc* grad_op_desc_; std::unordered_map* grad_to_var_; + std::map> grad_input_vars_; + std::map> grad_output_vars_; framework::BlockDesc* block_; }; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 97772dc110..e7a60621cd 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -41,6 +41,14 @@ void CreateGradOp(const framework::OpDesc& op_desc, *grad_op_desc = grad_op_descs[0].release(); } +void InitVar(framework::Variable* var, framework::Variable* grad_var) { + auto& var_t = var->Get(); + float* data = + grad_var->GetMutable()->mutable_data( + var_t.dims(), platform::CPUPlace()); + 
std::fill(data, data + var_t.numel(), 0.0); +} + class Tracer { public: explicit Tracer(framework::BlockDesc* root_block, @@ -53,10 +61,13 @@ class Tracer { virtual ~Tracer() { delete root_scope_; } - void Trace(OpBase* op, const std::vector& inputs, - const std::vector& outputs, + void Trace(OpBase* op, + const std::map>& inputs, + const std::map>& outputs, framework::BlockDesc* block) { - framework::Scope* scope = GetScope(block); + // framework::Scope* scope = GetScope(block); + std::map vars; + framework::OpDesc* op_desc = op->op_desc_; VLOG(3) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block); @@ -64,48 +75,60 @@ class Tracer { std::unique_ptr op_base = framework::OpRegistry::CreateOp(*op_desc); - *op->input_vars_ = inputs; - for (VarBase* input : inputs) { - const std::string vname = input->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - input->var_ = var; - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); + framework::VariableValueMap invars_map; + framework::VariableValueMap outvars_map; + + op->input_vars_ = inputs; + for (auto it : op->input_vars_) { + auto& invars = invars_map[it.first]; + for (VarBase* inp : it.second) { + PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", + op->op_desc_->Type(), inp->var_desc_->Name()); + + invars.push_back(inp->var_); + vars[inp->var_desc_->Name()] = inp; + if (inp->pre_op_) { + (*op->pre_ops_)[it.first].push_back(inp->pre_op_); + (*op->pre_ops_out_idx_)[it.first].push_back(inp->pre_op_out_idx_); } else { - LOG(ERROR) << "tracer doesn't support yet"; + (*op->pre_ops_)[it.first].push_back(nullptr); } + VLOG(3) << "input vname " << inp->var_desc_->Name() << " " + << inp->var_->Get().dims().size() + << reinterpret_cast(inp->var_); } - if (input->pre_op_) { - op->pre_ops_->push_back(input->pre_op_); - op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); - } else { - op->pre_ops_->push_back(nullptr); - } - VLOG(3) << "input vname " << vname << " " - << var->Get().dims().size(); } - *op->output_vars_ = outputs; - for (size_t i = 0; i < outputs.size(); ++i) { - const std::string vname = outputs[i]->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); + op->output_vars_ = outputs; + for (auto it : op->output_vars_) { + auto& outvars = outvars_map[it.first]; + const std::vector& outputs = it.second; + for (size_t i = 0; i < outputs.size(); ++i) { + VarBase* out = outputs[i]; + outvars.push_back(out->var_); + vars[out->var_desc_->Name()] = out; + + framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); + out->var_->GetMutable(); } else { LOG(ERROR) << "tracer doesn't support yet"; } + out->pre_op_ = op; + out->pre_op_out_name_ = it.first; + out->pre_op_out_idx_ = i; + + VLOG(3) << "output vname " << out->var_desc_->Name() << " " + << out->var_->Get().dims().size() << " " + << reinterpret_cast(out->var_) << " " + << out->var_->IsInitialized(); } - outputs[i]->var_ = var; - outputs[i]->pre_op_ = op; - outputs[i]->pre_op_out_idx_ = i; } VLOG(3) << "tracer running " << op_desc->Type(); - op_base->Run(*scope, platform::CPUPlace()); + framework::RuntimeContext ctx(invars_map, outvars_map); + op_base->Run(ctx, platform::CPUPlace()); + if (block == startup_block_) { 
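Before the grad bookkeeping below, note what Trace has already recorded: every output VarBase keeps a back-pointer to its producer (pre_op_), the producer's output slot name, and its index within that slot, while the op remembers each input's producer in pre_ops_. That is the entire graph structure autograd later walks. A tiny self-contained illustration, with hypothetical Var/Op structs standing in for VarBase/OpBase:

    #include <map>
    #include <string>
    #include <vector>

    struct Op;
    struct Var {
      Op* pre_op = nullptr;         // producer; nullptr for leaf inputs
      std::string pre_op_out_name;  // producer's output slot, e.g. "Out"
      int pre_op_out_idx = -1;      // index within that slot
    };
    struct Op {
      std::map<std::string, std::vector<Op*>> pre_ops;  // per input slot
    };

    int main() {
      Op relu;
      Var x, y;  // y = relu(x); x is a leaf
      y.pre_op = &relu;
      y.pre_op_out_name = "Out";
      y.pre_op_out_idx = 0;
      relu.pre_ops["X"].push_back(x.pre_op);  // nullptr records a leaf edge
      return 0;
    }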
op->grad_op_desc_ = nullptr; op->grad_to_var_ = nullptr; @@ -115,6 +138,39 @@ class Tracer { CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; op->grad_to_var_ = grad_to_var; + + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[it.first]; + for (const std::string& grad_invar : it.second) { + block->FindRecursiveOrCreateVar(grad_invar); + auto var_it = op->grad_to_var_->find(grad_invar); + if (var_it == op->grad_to_var_->end()) { + auto fwd_var_it = vars.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != vars.end()); + grad_in_vars.push_back(fwd_var_it->second->var_); + } else { + VarBase* var = vars[var_it->second]; + if (!var->grads_->IsInitialized()) { + InitVar(var->var_, var->grads_); + } + grad_in_vars.push_back(var->grads_); + } + } + } + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[it.first]; + for (const std::string& grad_outvar : it.second) { + block->FindRecursiveOrCreateVar(grad_outvar); + auto var_it = op->grad_to_var_->find(grad_outvar); + PADDLE_ENFORCE(var_it != op->grad_to_var_->end()); + VarBase* var = vars[var_it->second]; + if (!var->grads_->IsInitialized()) { + InitVar(var->var_, var->grads_); + } + LOG(ERROR) << grad_outvar << " map to " << var->var_desc_->Name(); + grad_out_vars.push_back(var->grads_); + } + } } op->block_ = block; } diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 38cb33e790..7b04c5d21f 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -68,6 +68,41 @@ class FillConstantOp : public framework::OperatorBase { auto &dev_ctx = *pool.Get(dev_place); math::set_constant(dev_ctx, tensor, value); } + + void RunImpl(const framework::RuntimeContext &ctx, + const platform::Place &dev_place) const override { + auto data_type = + static_cast(Attr("dtype")); + auto value = Attr("value"); + auto force_cpu = Attr("force_cpu"); + + framework::Tensor *tensor = nullptr; + + auto &out_var = *ctx.outputs.at("Out")[0]; + + if (out_var.IsType()) { + tensor = out_var.GetMutable(); + tensor->Resize(framework::make_ddim(Attr>("shape"))); + } else if (out_var.IsType()) { + tensor = out_var.GetMutable()->mutable_value(); + tensor->Resize(framework::make_ddim(Attr>("shape"))); + } else { + PADDLE_THROW( + "fill constant op's output only" + "supports SelectedRows and LoDTensor"); + } + + if (force_cpu) { + auto cpu = platform::CPUPlace(); + tensor->mutable_data(cpu, data_type); + } else { + tensor->mutable_data(dev_place, data_type); + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + math::set_constant(dev_ctx, tensor, value); + } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 81d63aace0..2ffdc90d84 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -124,9 +124,7 @@ PYBIND11_MODULE(core, m) { py::class_(m, "VarBase", R"DOC()DOC") .def(py::init<>()) .def("_run_backward", - [](imperative::VarBase &self, framework::Scope *scope) { - self.RunBackward(scope); - }) + [](imperative::VarBase &self) { self.RunBackward(); }) .def("_grad", &imperative::VarBase::Grad) .def_property( "desc", @@ -134,7 +132,13 @@ PYBIND11_MODULE(core, m) { [](imperative::VarBase &self, framework::VarDesc *var_desc) { self.var_desc_ = var_desc; }, - 
py::return_value_policy::reference); + py::return_value_policy::reference) + .def_property("var", + [](const imperative::VarBase &self) { return self.var_; }, + [](imperative::VarBase &self, framework::Variable *var) { + self.var_ = var; + }, + py::return_value_policy::reference); py::class_(m, "OpBase", R"DOC()DOC") .def(py::init<>()) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index de30ed2fc5..823b6d80be 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -15,6 +15,7 @@ from __future__ import print_function import collections +from collections import defaultdict import contextlib import os import re @@ -369,13 +370,11 @@ class Variable(object): self._ivar.desc = self.desc def _numpy(self): - scope = _imperative_tracer().get_scope(self.block.desc) - tensor = core.get_variable_tensor(scope, self.desc.name()) + tensor = self._ivar.var.get_tensor() return np.array(tensor) def _backward(self): - scope = _imperative_tracer().get_scope(self.block.desc) - self._ivar._run_backward(scope) + self._ivar._run_backward() def _gradient(self): return np.array(self._ivar._grad()) @@ -692,20 +691,20 @@ class Operator(object): if _in_imperative_mode(): self.iop = core.OpBase() self.iop.desc = self.desc - self.inputs = [] + self.inputs = defaultdict(list) if inputs is not None: - for inp in inputs.values(): - if isinstance(inp, Variable): - self.inputs.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - self.inputs.extend(inp[:]) - self.outputs = [] + for k, v in six.iteritems(inputs): + if isinstance(v, Variable): + self.inputs[k].append(v._ivar) + elif isinstance(v, list) or isinstance(v, tuple): + self.inputs[k].extend([var._ivar for var in v]) + self.outputs = defaultdict(list) if outputs is not None: - for out in outputs.values(): - if isinstance(out, Variable): - self.outputs.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - self.outputs.extend(out[:]) + for k, v in six.iteritems(outputs): + if isinstance(v, Variable): + self.outputs[k].append(v._ivar) + elif isinstance(v, list) or isinstance(v, tuple): + self.outputs[k].extend([var._ivar for var in v]) def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET @@ -1273,8 +1272,7 @@ class Block(object): op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc) self.ops.append(op) return op @@ -1325,8 +1323,7 @@ class Block(object): op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc) self.ops.insert(0, op) return op diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index aa48ef71aa..61e243e288 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -46,8 +46,7 @@ def to_variable(value, block=None): name=None, shape=value.shape, dtype=value.dtype) - scope = framework._imperative_tracer().get_scope(block.desc) - var = scope.var(py_var.name) + var = py_var._ivar.var tensor = var.get_tensor() tensor.set(value, core.CPUPlace()) return py_var diff --git 
a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 74b4a977db..0a299bc2fb 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -20,7 +20,7 @@ import six import sys import numpy as np -from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating +from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode from . import unique_name from paddle.fluid.initializer import Constant, Xavier from paddle.fluid.imperative import base @@ -313,11 +313,20 @@ class LayerHelper(object): param = self._create_weight_normalize(attr, shape, dtype) WeightNormParamAttr.params_with_weight_norm.append(param) return param - - self.startup_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True)) - return self.main_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs()) + if _in_imperative_mode(): + self.main_program.global_block().create_parameter( + dtype=dtype, shape=shape, **attr._to_kwargs()) + return self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + else: + self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + return self.main_program.global_block().create_parameter( + dtype=dtype, shape=shape, **attr._to_kwargs()) def get_parameter(self, name): param = self.main_program.global_block().var(name) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index cc1fdbd285..d83e2735ff 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -20,6 +20,7 @@ from __future__ import print_function import numpy as np import six import os +import sys import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant @@ -9682,6 +9683,7 @@ class FC(layers.PyLayer): shape=param_shape, dtype=self._dtype, is_bias=False) + sys.stderr.write('created w: %s\n' % self._w.name) def forward(self, inputs): tmp = self._helper.create_variable_for_type_inference(self._dtype) diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 0fe69d1bd4..6368f9b44a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
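One non-obvious piece of the test change below: MyLayer.forward gains a reduce_sum, so the traced output is a single scalar. That matches how VarBase::RunBackward seeds the backward pass, filling the output gradient with ones, since for a scalar loss the seed is exactly dL/dL = 1. Concretely, with x the stashed relu output and y = sum(x * x), the gradient the test inspects is dy/dx_i = 2 * x_i, and the imperative result can be compared directly against the static-graph append_backward value over the same relu, elementwise_mul, reduce_sum chain.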
+import sys import contextlib import unittest import numpy as np @@ -38,7 +39,9 @@ class MyLayer(fluid.imperative.PyLayer): def forward(self, inputs): x = fluid.layers.relu(inputs[0]) self._x_for_debug = x - return [fluid.layers.elementwise_mul(x, x)] + x = fluid.layers.elementwise_mul(x, x) + x = fluid.layers.reduce_sum(x) + return [x] class MLP(fluid.imperative.PyLayer): @@ -79,10 +82,12 @@ class TestImperative(unittest.TestCase): with new_program_scope(): inp = fluid.layers.data( name="inp", shape=[3], append_batch_size=False) - l = MyLayer() - x = l(inp)[0] + x = fluid.layers.relu(inp) + x_for_debug = x + x = fluid.layers.elementwise_mul(x, x) + x = fluid.layers.reduce_sum(x) param_grads = fluid.backward.append_backward( - x, parameter_list=[l._x_for_debug.name])[0] + x, parameter_list=[x_for_debug.name])[0] exe = fluid.Executor(fluid.CPUPlace()) static_out, static_grad = exe.run( From 61491ce250548122ec3abf3df0928c819906e091 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 26 Dec 2018 16:29:55 +0800 Subject: [PATCH 68/77] clean test=develop --- paddle/fluid/framework/operator.cc | 14 +++++----- paddle/fluid/framework/operator.h | 10 ++++--- paddle/fluid/imperative/layer.cc | 32 ++++------------------ paddle/fluid/imperative/tracer.h | 29 ++------------------ paddle/fluid/operators/fill_constant_op.cc | 4 +-- paddle/fluid/pybind/imperative.cc | 4 +-- python/paddle/fluid/layers/nn.py | 2 -- 7 files changed, 24 insertions(+), 71 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 38675d2cac..51b7f572c9 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -182,7 +182,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { void OperatorBase::Run(const RuntimeContext& ctx, const platform::Place& place) { - RunImpl(ctx, place); + RunImplPrepared(ctx, place); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -959,9 +959,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } -void OperatorWithKernel::RunImpl(const RuntimeContext& ctx, - const platform::Place& place) const { - Scope scope; +void OperatorWithKernel::RunImplPrepared(const RuntimeContext& ctx, + const platform::Place& place) const { + Scope dummy_scope; platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -976,7 +976,7 @@ void OperatorWithKernel::RunImpl(const RuntimeContext& ctx, OpKernelMap& kernels = kernels_iter->second; auto expected_kernel_key = this->GetExpectedKernelType( - ExecutionContext(*this, scope, *dev_ctx, ctx)); + ExecutionContext(*this, dummy_scope, *dev_ctx, ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -999,9 +999,9 @@ void OperatorWithKernel::RunImpl(const RuntimeContext& ctx, dev_ctx = pool.Get(expected_kernel_key.place_); } - RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx); + RuntimeInferShapeContext infer_shape_ctx(*this, dummy_scope, ctx); this->InferShape(&infer_shape_ctx); - kernel_iter->second(ExecutionContext(*this, scope, *dev_ctx, ctx)); + kernel_iter->second(ExecutionContext(*this, dummy_scope, *dev_ctx, ctx)); } void OperatorWithKernel::TransferInplaceVarsBack( diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 446d27efa0..3605bf22fc 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -173,8 +173,10 @@ class OperatorBase { virtual void 
RunImpl(const Scope& scope, const platform::Place& place) const = 0; - virtual void RunImpl(const RuntimeContext& ctx, - const platform::Place& place) const {} + virtual void RunImplPrepared(const RuntimeContext& ctx, + const platform::Place& place) const { + PADDLE_THROW("%s doesn't support RunPreparedImpl", Type()); + } }; class ExecutionContext { @@ -466,8 +468,8 @@ class OperatorWithKernel : public OperatorBase { // same. proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; - void RunImpl(const RuntimeContext& ctx, - const platform::Place& place) const final; + void RunImplPrepared(const RuntimeContext& ctx, + const platform::Place& place) const final; /** * Transfer data from scope to a transfered scope. If there is no data need to diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 239ff029db..7741865f9f 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -31,11 +31,6 @@ using framework::Variable; void AddTo(Variable* src, Variable* dst) { framework::LoDTensor* dst_tensor = dst->GetMutable(); framework::LoDTensor* src_tensor = src->GetMutable(); - - VLOG(3) << "apply var grad " << src_tensor->data()[0] << " " - << src_tensor->data()[1] << " " - << src_tensor->data()[2]; - PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", dst_tensor->numel(), src_tensor->numel()); float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); @@ -43,10 +38,6 @@ void AddTo(Variable* src, Variable* dst) { for (size_t i = 0; i < src_tensor->numel(); ++i) { dst_data[i] += src_data[i]; } - - VLOG(3) << "apply var dst grad " << dst_tensor->data()[0] << " " - << dst_tensor->data()[1] << " " - << dst_tensor->data()[2]; } class Autograd { @@ -55,16 +46,10 @@ class Autograd { void RunBackward(VarBase* var) { PADDLE_ENFORCE(var->pre_op_->op_desc_); - // TODO(panyx0718): Only create for vars that "require_grad" - LOG(ERROR) << reinterpret_cast(var->grads_) << " vs " - << reinterpret_cast( - var->pre_op_ - ->output_vars_[var->pre_op_out_name_] - [var->pre_op_out_idx_] - ->grads_); - var->pre_op_->output_vars_[var->pre_op_out_name_][var->pre_op_out_idx_] - ->grads_->GetMutable() - ->ShareDataWith(var->grads_->Get()); + PADDLE_ENFORCE( + var->grads_ == + var->pre_op_->output_vars_[var->pre_op_out_name_][var->pre_op_out_idx_] + ->grads_); std::deque ready; ready.push_back(var->pre_op_); @@ -76,7 +61,6 @@ class Autograd { ready.pop_front(); std::map> input_grads = ready_op->ApplyGrad(); - VLOG(3) << "after apply grad"; for (auto it : input_grads) { const std::vector& ingrads = it.second; @@ -160,17 +144,12 @@ std::map> OpBase::ApplyGrad() { for (size_t i = 0; i < it.second.size(); ++i) { outputs.push_back(new framework::Variable()); outputs.back()->GetMutable(); - /* - auto& accum_grad_t = it.second[i]->Get(); - Variable* grad_var = outputs.back(); - float* data = grad_var->GetMutable() - ->mutable_data(accum_grad_t.dims(), platform::CPUPlace()); - std::fill(data, data + accum_grad_t.numel(), 0.0);*/ } } framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + // No need to do static infer shape here. 
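The PADDLE_ENFORCE added above replaces the earlier ShareDataWith: because the tracer now wires the very same grads_ Variable into both the VarBase and its producer's grad bookkeeping, the two handles must already be one object, and seeding it once with ones in RunBackward is sufficient. The invariant in miniature, with plain pointers standing in for Variable*:

    #include <cassert>

    int main() {
      float buffer[3] = {0.f, 0.f, 0.f};
      float* loss_grads = buffer;          // what VarBase::grads_ points at
      float* producer_out_grads = buffer;  // the pre-op's matching output grad
      assert(loss_grads == producer_out_grads);  // what the patch enforces
      return 0;
    }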
// grad_op_desc_->InferShape(*block_); grad_op_desc_->InferVarType(block_); @@ -184,7 +163,6 @@ std::map> OpBase::ApplyGrad() { for (size_t i = 0; i < outputs.size(); ++i) { framework::Variable* orig_grad = origin_outputs[i]; AddTo(outputs[i], orig_grad); - VLOG(3) << "done add to " << grad_op_desc_->Outputs().at(it.first)[i]; } } return input_vars_; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index e7a60621cd..6b2e978737 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" @@ -53,19 +52,14 @@ class Tracer { public: explicit Tracer(framework::BlockDesc* root_block, framework::BlockDesc* startup_block) - : root_block_(root_block), startup_block_(startup_block) { - root_scope_ = new framework::Scope(); - scopes_[root_block_] = root_scope_; - scopes_[startup_block_] = root_scope_; - } + : root_block_(root_block), startup_block_(startup_block) {} - virtual ~Tracer() { delete root_scope_; } + virtual ~Tracer() {} void Trace(OpBase* op, const std::map>& inputs, const std::map>& outputs, framework::BlockDesc* block) { - // framework::Scope* scope = GetScope(block); std::map vars; framework::OpDesc* op_desc = op->op_desc_; @@ -94,8 +88,7 @@ class Tracer { (*op->pre_ops_)[it.first].push_back(nullptr); } VLOG(3) << "input vname " << inp->var_desc_->Name() << " " - << inp->var_->Get().dims().size() - << reinterpret_cast(inp->var_); + << inp->var_->IsInitialized(); } } @@ -119,8 +112,6 @@ class Tracer { out->pre_op_out_idx_ = i; VLOG(3) << "output vname " << out->var_desc_->Name() << " " - << out->var_->Get().dims().size() << " " - << reinterpret_cast(out->var_) << " " << out->var_->IsInitialized(); } } @@ -167,7 +158,6 @@ class Tracer { if (!var->grads_->IsInitialized()) { InitVar(var->var_, var->grads_); } - LOG(ERROR) << grad_outvar << " map to " << var->var_desc_->Name(); grad_out_vars.push_back(var->grads_); } } @@ -175,22 +165,9 @@ class Tracer { op->block_ = block; } - framework::Scope* GetScope(framework::BlockDesc* block) { - if (scopes_.find(block) != scopes_.end()) { - return scopes_.at(block); - } - framework::BlockDesc* parent_block = block->ParentBlock(); - PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); - framework::Scope* scope = &scopes_[parent_block]->NewScope(); - scopes_[block] = scope; - return scope; - } - private: - std::map scopes_; framework::BlockDesc* root_block_; framework::BlockDesc* startup_block_; - framework::Scope* root_scope_; }; } // namespace imperative diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 7b04c5d21f..d10fb1214c 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -69,8 +69,8 @@ class FillConstantOp : public framework::OperatorBase { math::set_constant(dev_ctx, tensor, value); } - void RunImpl(const framework::RuntimeContext &ctx, - const platform::Place &dev_place) const override { + void RunImplPrepared(const framework::RuntimeContext &ctx, + const platform::Place &dev_place) const override { auto data_type = static_cast(Attr("dtype")); auto value = Attr("value"); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index be63fb8778..7f9d937981 100644 --- a/paddle/fluid/pybind/imperative.cc +++ 
b/paddle/fluid/pybind/imperative.cc @@ -28,9 +28,7 @@ void BindTracer(pybind11::module *m) { framework::BlockDesc *startup_block) { new (&self) imperative::Tracer(root_block, startup_block); }) - .def("trace", &imperative::Tracer::Trace) - .def("get_scope", &imperative::Tracer::GetScope, - pybind11::return_value_policy::reference); + .def("trace", &imperative::Tracer::Trace); } } // namespace pybind diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d83e2735ff..cc1fdbd285 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -20,7 +20,6 @@ from __future__ import print_function import numpy as np import six import os -import sys import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant @@ -9683,7 +9682,6 @@ class FC(layers.PyLayer): shape=param_shape, dtype=self._dtype, is_bias=False) - sys.stderr.write('created w: %s\n' % self._w.name) def forward(self, inputs): tmp = self._helper.create_variable_for_type_inference(self._dtype) From 7b6bf9ddf23a70a0f67dcf412034d9cf8a02e5ef Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 26 Dec 2018 19:17:37 +0800 Subject: [PATCH 69/77] make fill_constant kernel-based test=develop --- paddle/fluid/operators/fill_constant_op.cc | 113 +++++------------- paddle/fluid/operators/fill_constant_op.cu.cc | 20 ++++ paddle/fluid/operators/fill_constant_op.h | 64 ++++++++++ paddle/fluid/pybind/imperative.cc | 1 - 4 files changed, 111 insertions(+), 87 deletions(-) create mode 100644 paddle/fluid/operators/fill_constant_op.cu.cc create mode 100644 paddle/fluid/operators/fill_constant_op.h diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index d10fb1214c..6c7b9fa115 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -12,103 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/fill_constant_op.h" namespace paddle { namespace operators { -class FillConstantInferShape : public framework::InferShapeBase { +class FillConstantOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of FillConstantOp should not be null."); - auto &shape = ctx->Attrs().Get>("shape"); + auto& shape = ctx->Attrs().Get>("shape"); ctx->SetOutputDim("Out", framework::make_ddim(shape)); } -}; - -class FillConstantOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto data_type = - static_cast(Attr("dtype")); - auto value = Attr("value"); - auto force_cpu = Attr("force_cpu"); - framework::Tensor *tensor = nullptr; - - auto &out_var = *scope.FindVar(Output("Out")); - - if (out_var.IsType()) { - tensor = out_var.GetMutable(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else if (out_var.IsType()) { - tensor = out_var.GetMutable()->mutable_value(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else { - PADDLE_THROW( - "fill constant op's output only" - "supports SelectedRows and LoDTensor"); - } - - if (force_cpu) { - auto cpu = platform::CPUPlace(); - tensor->mutable_data(cpu, data_type); - } else { - tensor->mutable_data(dev_place, data_type); - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - math::set_constant(dev_ctx, tensor, value); - } - - void RunImplPrepared(const framework::RuntimeContext &ctx, - const platform::Place &dev_place) const override { - auto data_type = - static_cast(Attr("dtype")); - auto value = Attr("value"); - auto force_cpu = Attr("force_cpu"); - - framework::Tensor *tensor = nullptr; - - auto &out_var = *ctx.outputs.at("Out")[0]; - - if (out_var.IsType()) { - tensor = out_var.GetMutable(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else if (out_var.IsType()) { - tensor = out_var.GetMutable()->mutable_value(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else { - PADDLE_THROW( - "fill constant op's output only" - "supports SelectedRows and LoDTensor"); - } - - if (force_cpu) { - auto cpu = platform::CPUPlace(); - tensor->mutable_data(cpu, data_type); - } else { - tensor->mutable_data(dev_place, data_type); - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - math::set_constant(dev_ctx, tensor, value); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto data_type = static_cast( + 
boost::get(op_desc.GetAttr("dtype"))); + auto& out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetDataType(data_type); + } }; class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -142,7 +79,11 @@ Fill up a variable with specified constant value. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, - ops::FillConstantInferShape, ops::FillConstantOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::FillConstantOpVarTypeInference); + +REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker, + ops::FillConstantOpVarTypeInference, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc new file mode 100644 index 0000000000..fba5583505 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.cu.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_constant_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h new file mode 100644 index 0000000000..417c5b4da6 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +template +class FillConstantKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto data_type = + static_cast(ctx.Attr("dtype")); + auto value = ctx.Attr("value"); + auto force_cpu = ctx.Attr("force_cpu"); + + framework::Tensor *tensor = nullptr; + + framework::Variable *out_var = ctx.OutputVar("Out"); + + if (out_var->IsType()) { + tensor = out_var->GetMutable(); + tensor->Resize( + framework::make_ddim(ctx.Attr>("shape"))); + } else if (out_var->IsType()) { + tensor = out_var->GetMutable()->mutable_value(); + tensor->Resize( + framework::make_ddim(ctx.Attr>("shape"))); + } else { + PADDLE_THROW( + "fill constant op's output only" + "supports SelectedRows and LoDTensor"); + } + + if (force_cpu) { + tensor->mutable_data(platform::CPUPlace(), data_type); + } else { + tensor->mutable_data(ctx.GetPlace(), data_type); + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(ctx.GetPlace()); + math::set_constant(dev_ctx, tensor, value); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 7f9d937981..819943508b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/tracer.h" namespace paddle { From 4e80e04f230cdd1c8e14eabfd204329b33867f8c Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 26 Dec 2018 19:22:32 +0800 Subject: [PATCH 70/77] fix test=develop --- paddle/fluid/framework/operator.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 51b7f572c9..ea3f4b7715 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,7 +16,6 @@ limitations under the License. 
*/ #include #include - #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -1104,8 +1103,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized", - ipt_name); + PADDLE_ENFORCE(t->IsInitialized(), "Input is not initialized"); int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, From f52b514dcd2db6dcec5c817ac516baf5af4273eb Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 27 Dec 2018 09:38:44 +0800 Subject: [PATCH 71/77] call kernel --- paddle/fluid/framework/operator.cc | 11 ++- paddle/fluid/framework/operator.h | 5 +- paddle/fluid/imperative/layer.cc | 30 +++++--- paddle/fluid/imperative/layer.h | 73 +++++++++++++++---- paddle/fluid/imperative/tracer.h | 29 +++++--- paddle/fluid/operators/fill_constant_op.cc | 3 +- python/paddle/fluid/layer_helper.py | 2 + .../fluid/tests/unittests/test_imperative.py | 9 +-- 8 files changed, 114 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ea3f4b7715..dc365a954d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -179,8 +179,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(3) << place << " " << DebugStringEx(&scope); } -void OperatorBase::Run(const RuntimeContext& ctx, - const platform::Place& place) { +void OperatorBase::RunPrepared(const RuntimeContext& ctx, + const platform::Place& place) { RunImplPrepared(ctx, place); } @@ -1092,7 +1092,9 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { int data_type = -1; for (auto& input : this->inputs_) { - for (const Variable* var : ctx.MultiInputVar(input.first)) { + const std::vector vars = ctx.MultiInputVar(input.first); + for (size_t i = 0; i < vars.size(); ++i) { + const Variable* var = vars[i]; if (var != nullptr) { const Tensor* t = nullptr; if (var->IsType()) { @@ -1103,7 +1105,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - PADDLE_ENFORCE(t->IsInitialized(), "Input is not initialized"); + PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", + input.first, i); int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 3605bf22fc..a6bdc0bfa7 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -105,7 +105,7 @@ class OperatorBase { /// Executor will call this interface function to Run an op. // The implementation should be written at RunImpl void Run(const Scope& scope, const platform::Place& place); - void Run(const RuntimeContext& ctx, const platform::Place& place); + void RunPrepared(const RuntimeContext& ctx, const platform::Place& place); // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. 
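With the Scope lookup gone, IndicateDataType now walks ctx.MultiInputVar(slot) for every input slot, skipping uninitialized variables and requiring all initialized tensors to agree on one dtype, with -1 meaning "not yet determined". The rule in isolation:

    #include <cassert>
    #include <vector>

    // Sketch of the dtype consistency rule; -1 means no dtype seen yet.
    int CommonDType(const std::vector<int>& seen_dtypes) {
      int data_type = -1;
      for (int t : seen_dtypes) {
        assert(t == data_type || data_type == -1);  // mixed dtypes are an error
        data_type = t;
      }
      return data_type;
    }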
virtual void Stop() {} @@ -457,8 +457,9 @@ class OperatorWithKernel : public OperatorBase { void RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const override; - protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + + protected: virtual OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const OpKernelType& expected_kernel_type) const; diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 7741865f9f..0d850ee162 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -45,12 +45,6 @@ class Autograd { Autograd() {} void RunBackward(VarBase* var) { - PADDLE_ENFORCE(var->pre_op_->op_desc_); - PADDLE_ENFORCE( - var->grads_ == - var->pre_op_->output_vars_[var->pre_op_out_name_][var->pre_op_out_idx_] - ->grads_); - std::deque ready; ready.push_back(var->pre_op_); @@ -66,7 +60,7 @@ class Autograd { const std::vector& ingrads = it.second; for (size_t i = 0; i < ingrads.size(); ++i) { if (!ingrads[i]) continue; - OpBase* pre_op = (*ready_op->pre_ops_)[it.first][i]; + OpBase* pre_op = ready_op->pre_ops_[it.first][i]; if (!pre_op) continue; dep_counts[pre_op] -= 1; @@ -91,7 +85,7 @@ class Autograd { while (!queue.empty()) { OpBase* candidate = queue.front(); queue.pop_front(); - for (auto it : *(candidate->pre_ops_)) { + for (auto it : candidate->pre_ops_) { for (OpBase* pre_op : it.second) { if (!pre_op) continue; if (visited.find(pre_op) == visited.end()) { @@ -138,11 +132,13 @@ std::map> OpBase::ApplyGrad() { } VLOG(3) << "op grad " << grad_op_desc_->Type(); + std::vector> tmp_vars; std::map> grad_outputs; for (auto it : grad_output_vars_) { auto& outputs = grad_outputs[it.first]; for (size_t i = 0; i < it.second.size(); ++i) { - outputs.push_back(new framework::Variable()); + tmp_vars.emplace_back(new framework::Variable()); + outputs.push_back(tmp_vars.back().get()); outputs.back()->GetMutable(); } } @@ -155,7 +151,15 @@ std::map> OpBase::ApplyGrad() { std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc_); - opbase->Run(ctx, platform::CPUPlace()); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); for (auto it : grad_output_vars_) { auto& outputs = grad_outputs[it.first]; @@ -169,11 +173,15 @@ std::map> OpBase::ApplyGrad() { } void VarBase::RunBackward() { + if (!pre_op_) return; + auto grads_t = grads_->GetMutable(); float* data = grads_t->mutable_data(platform::CPUPlace()); std::fill(data, data + grads_t->numel(), 1.0); - if (!pre_op_) return; + PADDLE_ENFORCE( + grads_ == + pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_); Autograd().RunBackward(this); } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index eb5fd553bd..6225edea77 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -25,6 +25,59 @@ namespace paddle { namespace imperative { +class PreparedOp { + public: + PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + framework::OperatorWithKernel::OpKernelFunc func, + platform::DeviceContext* dev_ctx) + : op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {} + 
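PreparedOp is the piece that lets imperative code run kernels without a populated Scope: Prepare does the one-time kernel selection (expected kernel key, MKLDNN fallback, device context), and the caller then invokes infer-shape and the chosen kernel functor directly. The call pattern, exactly as ApplyGrad above and the tracer use it (scope is only a dummy required by ExecutionContext):

    PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
    p.op.RuntimeInferShape(scope, place, ctx);
    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));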
+ static PreparedOp Prepare(const framework::RuntimeContext& ctx, + const framework::OperatorWithKernel& op, + const platform::Place& place) { + framework::Scope dummy_scope; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. + auto& all_op_kernels = op.AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op.Type()); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + op.Type()); + } + + framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; + + auto expected_kernel_key = op.GetExpectedKernelType( + framework::ExecutionContext(op, dummy_scope, *dev_ctx, ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = framework::LibraryType::kPlain; + expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", op.Type(), + KernelTypeToString(expected_kernel_key)); + } + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); + } + + const framework::OperatorBase& op; + const framework::RuntimeContext& ctx; + framework::OperatorWithKernel::OpKernelFunc func; + platform::DeviceContext* dev_ctx; +}; class OpBase; class VarBase { @@ -62,30 +115,22 @@ class VarBase { class OpBase { public: - OpBase() - : pre_ops_(new std::map>()), - pre_ops_out_idx_(new std::map>()), - op_desc_(nullptr), - grad_op_desc_(nullptr) {} + OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {} virtual ~OpBase() { - delete pre_ops_; - delete pre_ops_out_idx_; - if (grad_op_desc_) delete grad_op_desc_; - if (grad_to_var_) delete grad_to_var_; } std::map> ApplyGrad(); + framework::OpDesc* op_desc_; + framework::OpDesc* grad_op_desc_; + std::map> input_vars_; std::map> output_vars_; - std::map>* pre_ops_; - std::map>* pre_ops_out_idx_; - framework::OpDesc* op_desc_; + std::map> pre_ops_; + std::map> pre_ops_out_idx_; - framework::OpDesc* grad_op_desc_; - std::unordered_map* grad_to_var_; std::map> grad_input_vars_; std::map> grad_output_vars_; framework::BlockDesc* block_; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 6b2e978737..1f0c7b30b4 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -82,10 +82,10 @@ class Tracer { invars.push_back(inp->var_); vars[inp->var_desc_->Name()] = inp; if (inp->pre_op_) { - (*op->pre_ops_)[it.first].push_back(inp->pre_op_); - (*op->pre_ops_out_idx_)[it.first].push_back(inp->pre_op_out_idx_); + op->pre_ops_[it.first].push_back(inp->pre_op_); + op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_); } else { - (*op->pre_ops_)[it.first].push_back(nullptr); + op->pre_ops_[it.first].push_back(nullptr); } VLOG(3) << "input vname " << inp->var_desc_->Name() << " " << inp->var_->IsInitialized(); @@ -118,24 +118,33 @@ class Tracer { VLOG(3) << "tracer running " << op_desc->Type(); framework::RuntimeContext ctx(invars_map, outvars_map); - op_base->Run(ctx, 
platform::CPUPlace()); + // op_base->RunPrepared(ctx, platform::CPUPlace()); + + // TODO(panyx0718): Cache p. + framework::OperatorWithKernel* op_kernel = + dynamic_cast(op_base.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); if (block == startup_block_) { op->grad_op_desc_ = nullptr; - op->grad_to_var_ = nullptr; } else { framework::OpDesc* grad_op_desc; auto grad_to_var = new std::unordered_map(); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; for (auto it : grad_op_desc->Inputs()) { auto& grad_in_vars = op->grad_input_vars_[it.first]; for (const std::string& grad_invar : it.second) { block->FindRecursiveOrCreateVar(grad_invar); - auto var_it = op->grad_to_var_->find(grad_invar); - if (var_it == op->grad_to_var_->end()) { + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { auto fwd_var_it = vars.find(grad_invar); PADDLE_ENFORCE(fwd_var_it != vars.end()); grad_in_vars.push_back(fwd_var_it->second->var_); @@ -152,8 +161,8 @@ class Tracer { auto& grad_out_vars = op->grad_output_vars_[it.first]; for (const std::string& grad_outvar : it.second) { block->FindRecursiveOrCreateVar(grad_outvar); - auto var_it = op->grad_to_var_->find(grad_outvar); - PADDLE_ENFORCE(var_it != op->grad_to_var_->end()); + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end()); VarBase* var = vars[var_it->second]; if (!var->grads_->IsInitialized()) { InitVar(var->var_, var->grads_); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 6c7b9fa115..73f38de08e 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -86,4 +86,5 @@ REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker, REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, - ops::FillConstantKernel); + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 0a299bc2fb..8543cb847d 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -316,6 +316,8 @@ class LayerHelper(object): if _in_imperative_mode(): self.main_program.global_block().create_parameter( dtype=dtype, shape=shape, **attr._to_kwargs()) + # In imperative mode, we want the returned parameter to be + # initialized so that it can be used imperatively. return self.startup_program.global_block().create_parameter( dtype=dtype, shape=shape, diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 6368f9b44a..6b6ab227de 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import contextlib import unittest import numpy as np @@ -82,12 +81,10 @@ class TestImperative(unittest.TestCase): with new_program_scope(): inp = fluid.layers.data( name="inp", shape=[3], append_batch_size=False) - x = fluid.layers.relu(inp) - x_for_debug = x - x = fluid.layers.elementwise_mul(x, x) - x = fluid.layers.reduce_sum(x) + l = MyLayer() + x = l(inp)[0] param_grads = fluid.backward.append_backward( - x, parameter_list=[x_for_debug.name])[0] + x, parameter_list=[l._x_for_debug.name])[0] exe = fluid.Executor(fluid.CPUPlace()) static_out, static_grad = exe.run( From b91a7a9d3073e4e38f659f4353dbf4eb0215d816 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 27 Dec 2018 09:41:18 +0800 Subject: [PATCH 72/77] clear operator changes test=develop --- paddle/fluid/framework/operator.cc | 50 ------------------------------ paddle/fluid/framework/operator.h | 8 ----- paddle/fluid/imperative/tracer.h | 1 - 3 files changed, 59 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index dc365a954d..d67782319d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -179,11 +179,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(3) << place << " " << DebugStringEx(&scope); } -void OperatorBase::RunPrepared(const RuntimeContext& ctx, - const platform::Place& place) { - RunImplPrepared(ctx, place); -} - bool OperatorBase::HasInputs(const std::string& name) const { return inputs_.find(name) != inputs_.end(); } @@ -958,51 +953,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } -void OperatorWithKernel::RunImplPrepared(const RuntimeContext& ctx, - const platform::Place& place) const { - Scope dummy_scope; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - - // check if op[type] has kernel registered. 
- auto& all_op_kernels = AllOpKernels(); - auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", type_); - } - - OpKernelMap& kernels = kernels_iter->second; - - auto expected_kernel_key = this->GetExpectedKernelType( - ExecutionContext(*this, dummy_scope, *dev_ctx, ctx)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; - - auto kernel_iter = kernels.find(expected_kernel_key); -#ifdef PADDLE_WITH_MKLDNN - // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set - if (kernel_iter == kernels.end() && - expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; - expected_kernel_key.library_type_ = LibraryType::kPlain; - expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; - kernel_iter = kernels.find(expected_kernel_key); - } -#endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", type_, - KernelTypeToString(expected_kernel_key)); - } - - if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { - dev_ctx = pool.Get(expected_kernel_key.place_); - } - - RuntimeInferShapeContext infer_shape_ctx(*this, dummy_scope, ctx); - this->InferShape(&infer_shape_ctx); - kernel_iter->second(ExecutionContext(*this, dummy_scope, *dev_ctx, ctx)); -} - void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index a6bdc0bfa7..e2bedc60d2 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -105,7 +105,6 @@ class OperatorBase { /// Executor will call this interface function to Run an op. // The implementation should be written at RunImpl void Run(const Scope& scope, const platform::Place& place); - void RunPrepared(const RuntimeContext& ctx, const platform::Place& place); // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. virtual void Stop() {} @@ -172,11 +171,6 @@ class OperatorBase { void CheckAllInputOutputSet() const; virtual void RunImpl(const Scope& scope, const platform::Place& place) const = 0; - - virtual void RunImplPrepared(const RuntimeContext& ctx, - const platform::Place& place) const { - PADDLE_THROW("%s doesn't support RunPreparedImpl", Type()); - } }; class ExecutionContext { @@ -469,8 +463,6 @@ class OperatorWithKernel : public OperatorBase { // same. proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; - void RunImplPrepared(const RuntimeContext& ctx, - const platform::Place& place) const final; /** * Transfer data from scope to a transfered scope. If there is no data need to diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 1f0c7b30b4..c814da9853 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -118,7 +118,6 @@ class Tracer { VLOG(3) << "tracer running " << op_desc->Type(); framework::RuntimeContext ctx(invars_map, outvars_map); - // op_base->RunPrepared(ctx, platform::CPUPlace()); // TODO(panyx0718): Cache p. framework::OperatorWithKernel* op_kernel = From c132c790111d7fadf212a72ec2cd35e03aed364f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 27 Dec 2018 11:23:24 +0800 Subject: [PATCH 73/77] address comments and resolve conflicts. 
test=develop --- paddle/fluid/imperative/layer.cc | 20 -------------------- paddle/fluid/imperative/layer.h | 3 +-- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 0d850ee162..26e7830265 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -100,26 +100,6 @@ class Autograd { } }; -void CreateVariable(const std::string& name, const framework::DDim& dim, - float val, bool random_name, framework::Variable* var) { - if (var->IsInitialized()) return; - - std::string varname = name; - if (random_name) { - std::mt19937 rng; - rng.seed(std::random_device()()); - std::uniform_int_distribution dist6( - 1, std::numeric_limits::max()); - int id = dist6(rng); - varname = string::Sprintf("%s@%d", varname, id); - } - - VLOG(3) << "creating var " << varname; - framework::LoDTensor* tensor = var->GetMutable(); - float* data = tensor->mutable_data(dim, platform::CPUPlace()); - std::fill(data, data + tensor->numel(), val); -} - framework::LoDTensor& VarBase::Grad() { VLOG(3) << "get var grad " << var_desc_->Name(); return *grads_->GetMutable(); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 6225edea77..ae4e8e0f8a 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -36,7 +36,6 @@ class PreparedOp { static PreparedOp Prepare(const framework::RuntimeContext& ctx, const framework::OperatorWithKernel& op, const platform::Place& place) { - framework::Scope dummy_scope; platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -52,7 +51,7 @@ class PreparedOp { framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; auto expected_kernel_key = op.GetExpectedKernelType( - framework::ExecutionContext(op, dummy_scope, *dev_ctx, ctx)); + framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); From fe8495a7583b503a094168aae38a22843c96a72d Mon Sep 17 00:00:00 2001 From: chengduo Date: Wed, 26 Dec 2018 23:42:35 -0600 Subject: [PATCH 74/77] [WIP] Refine MultiDevSSAGraph (#15040) * refine parallel_exe test=develop * rename shared_var_device * code refine * add test_weight_decay * remove Sort test=develop * Add SortForReduce test=develop * code refine test=develop * follow comment test=develop --- .../details/multi_devices_graph_pass.cc | 405 +++++++++--------- .../details/multi_devices_graph_pass.h | 19 +- paddle/fluid/framework/ir/graph.cc | 58 --- paddle/fluid/framework/parallel_executor.cc | 5 +- python/paddle/fluid/parallel_executor.py | 4 +- .../tests/unittests/test_weight_decay.py | 188 ++++++++ 6 files changed, 401 insertions(+), 278 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_weight_decay.py diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 7e320a0894..5b9a818117 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -42,6 +42,12 @@ namespace { typedef std::vector GraphOps; const char kGraphOps[] = "ops"; +bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) { + return boost::get( + node.Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(role); +} + void 
PolishGraphToSupportDataHazards(ir::Graph *graph) { for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { @@ -147,6 +153,7 @@ void MultiDevSSAGraphBuilder::Init() const { #endif balance_vars_.resize(places_.size(), 0); + if (strategy_.enable_data_balance_ && places_.size() == 1) { LOG(WARNING) << "It is no need to enable data balance when there is only " "one place. enable_data_balance is set to False."; @@ -154,145 +161,16 @@ void MultiDevSSAGraphBuilder::Init() const { } } -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, - ir::Node *node, - size_t place_id) const { - auto p = places_[place_id]; - auto *op_handle = result->Get(kGraphOps).back(); - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); - - for (ir::Node *input : node->inputs) { - VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); - op_handle->AddInput(var); - } - - for (ir::Node *output : node->outputs) { - ir::Node *new_node = nullptr; - if (output->Var()) { - new_node = result->CreateVarNode(output->Var()); - } else { - new_node = - result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); - } - CreateOpOutput(result, op_handle, new_node, p, place_id); - } -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const std::vector &nodes) const { - std::vector send_vars; - // since parameters are all in block 0, - // it's enough to only scan send ops in block 0 - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find send op, - // instead of the the hard code string - if (op->Type() == "send") { - auto op_vars = op->InputArgumentNames(); - send_vars.reserve(send_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return send_vars; -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const std::vector &nodes) const { - std::vector recv_vars; - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find recv op, - // instead of the hard code string - if (op->Type() == "recv") { - auto op_vars = op->OutputArgumentNames(); - recv_vars.reserve(recv_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return recv_vars; -} - -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - if (all_vars_.find(var_name) == all_vars_.end()) continue; - auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -// Topology sort the graph nodes from inputs to outputs. -// Since SSAGraphBuilder depends on forward/backward nodes to assign devices -// to parameter/gradients before optimizer ops, topo sort is insufficient. ( -// some optimizer ops might not depend on any nodes), we manually move all -// optimizer nodes after last backward nodes. -// However, the assumption by SSAGraphBuilder should be relaxed in the future. 
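The comment above describes the reordering that the deleted helper below used to perform: keep the topological order, but hold optimizer ops back until the last backward op has been emitted. A rough standalone Python rendering of that idea (the op objects and role_of are placeholders, not Paddle API):

    def delay_optimize_ops(topo_ops, role_of):
        # index of the last backward op in topological order
        last_backward = max(
            (i for i, op in enumerate(topo_ops) if role_of(op) == "backward"),
            default=-1)
        sorted_ops, delayed = [], []
        for i, op in enumerate(topo_ops):
            if i < last_backward and role_of(op) == "optimize":
                delayed.append(op)  # hold optimizer ops back
            else:
                sorted_ops.append(op)
            if i == last_backward:
                sorted_ops.extend(delayed)  # re-emit after the last backward op
        return sorted_ops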
-std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { - std::vector ret = ir::TopologySortOperations(graph); - size_t last_backward = 0; - for (size_t i = 0; i < ret.size(); ++i) { - if (boost::get( - ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kBackward)) { - last_backward = i; - } - } - - std::vector optimize_ops; - std::vector sorted_ret; - for (size_t i = 0; i < ret.size(); ++i) { - if (i < last_backward) { - if (static_cast(boost::get(ret[i]->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kOptimize))) { - optimize_ops.push_back(ret[i]); - } else { - sorted_ret.push_back(ret[i]); - } - } else if (i == last_backward) { - sorted_ret.push_back(ret[i]); - // Verify that no operations before optimize ops depends on optimize ops. - std::unordered_set optimize_set(optimize_ops.begin(), - optimize_ops.end()); - for (ir::Node *n : sorted_ret) { - for (ir::Node *in : n->inputs) { - for (ir::Node *pre_n : in->inputs) { - PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), - "optimize operations cannot be depended by forward " - "or backward node %s -> %s", - pre_n->Name(), n->Name()); - } - } - } - sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), - optimize_ops.end()); - } else { - sorted_ret.push_back(ret[i]); - } - } - return sorted_ret; -} - std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unique_ptr graph) const { Init(); // Give the topology sort order and rebuild the graph structure. - std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); + std::vector sorted_ops = ir::TopologySortOperations(*graph); + + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + sorted_ops = SortForReduceMode(sorted_ops); + } + auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; @@ -303,31 +181,22 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( all_vars_.emplace(node->Name(), node->Var()); } } - std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 result.Set(kGraphVars, new GraphVars(places_.size())); result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - // find send/recv vars so that we can place the distributed training - // related op in the place 0 - auto send_vars = FindDistTrainSendVars(sorted_ops); - auto recv_vars = FindDistTrainRecvVars(sorted_ops); - std::vector> bcast_var_name_set; bcast_var_name_set.resize(places_.size()); - size_t cur_device_id = 0; bool is_forwarding = true; bool is_dist_train = false; std::unordered_map sharded_var_device; for (ir::Node *node : sorted_ops) { - if (boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kRPC)) { + if (OpHaveRole(*node, OpRole::kRPC)) { int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); PADDLE_ENFORCE(op_dev_id != -1, "Can not schedule the RPC operator to the right place."); @@ -341,9 +210,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } is_dist_train = true; - } else if (boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kDist)) { + } else if (OpHaveRole(*node, OpRole::kDist)) { int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); if (node->Op()->Type() == "concat") { auto origin_param_name = node->Op()->OutputArgumentNames()[0]; @@ -365,7 +232,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // the block. 
is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); + int op_dev_id = GetOpDeviceID(node, sharded_var_device); if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { @@ -385,47 +252,48 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - if (static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = boost::get>( - node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - sharded_var_device.emplace(g_name, cur_device_id); - if (!is_dist_train) { - bcast_var_name_set[cur_device_id].emplace(p_name); - } - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } + try { + auto backward_vars = boost::get>( + node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + size_t cur_device_id = -1; + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(&result, g_name, cur_device_id); + sharded_var_device.emplace(g_name, cur_device_id); + if (!is_dist_train) { + bcast_var_name_set[cur_device_id].emplace(p_name); + } + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(&result, g_name, 0); + CreateBroadcastOp(&result, g_name, 0); + } else { + InsertAllReduceOp(&result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy "; + break; } - } catch (boost::bad_get e) { } + } catch (boost::bad_get e) { } } } @@ -469,12 +337,108 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; +std::vector MultiDevSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::unordered_map sharded_var_device; + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + + auto insert_delayed_op = [&](const std::string 
&var_name, int dev_id) { + sharded_var_device.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); + } + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, sharded_var_device, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. + sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. + } } - return false; + + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); + return sorted_ops; +} + +void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { + auto p = places_[place_id]; + auto *op_handle = result->Get(kGraphOps).back(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + for (ir::Node *input : node->inputs) { + VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); + op_handle->AddInput(var); + } + + for (ir::Node *output : node->outputs) { + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, place_id); + } +} + +size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; } void MultiDevSSAGraphBuilder::SetCommunicationContext( @@ -625,28 +589,52 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( } int MultiDevSSAGraphBuilder::GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, + ir::Node *node, + const std::unordered_map &sharded_var_device, + std::unordered_map> *delay_ops) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + + if (!OpHaveRole(*node, 
framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +int MultiDevSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, const std::unordered_map &sharded_var_device) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - int op_role = boost::get( - node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); - if (op_role != static_cast(framework::OpRole::kOptimize)) { + + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { return -1; } auto param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(graph, param_grad[1], sharded_var_device); + int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", node->Op()->Type(), param_grad[0], param_grad[1]); return dev_id; } int MultiDevSSAGraphBuilder::GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, + const std::string &varname, const std::unordered_map &sharded_var_device) const { auto got = sharded_var_device.find(varname); if (got == sharded_var_device.end()) { @@ -740,8 +728,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp( node->Op()->Type() == "split_selected_rows" || node->Op()->Type() == "split_ids") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); + op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { @@ -752,8 +739,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp( sharded_var_device->emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); + op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); for (auto &varname : output_var_names) { sharded_var_device->emplace(varname, op_dev_id); } @@ -794,8 +780,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = - GetVarDeviceID(*result, node->inputs[0]->Name(), *sharded_var_device); + op_dev_id = GetVarDeviceID(node->inputs[0]->Name(), *sharded_var_device); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -825,8 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = - GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device); + op_dev_id = GetVarDeviceID(recv_param_grad[1], *sharded_var_device); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -861,8 +845,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = - GetVarDeviceID(*result, output->Name(), *sharded_var_device); + outvar_dev_id = GetVarDeviceID(output->Name(), *sharded_var_device); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; @@ -879,6 +862,14 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( return op_dev_id; } +bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; + } + return false; +} + bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { return boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 5736102ddc..7029e9dc18 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { #endif int GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, + const std::string &varname, const std::unordered_map &sharded_var_device) const; bool IsScaleLossOp(ir::Node *node) const; @@ -57,12 +57,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ir::Graph *result, ir::Node *node, std::unordered_map *sharded_var_device) const; - std::vector FindDistTrainSendVars( - const std::vector &nodes) const; - - std::vector FindDistTrainRecvVars( - const std::vector &nodes) const; - void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; @@ -77,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { int dev_id) const; int GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, + ir::Node *node, const std::unordered_map &sharded_var_device) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; @@ -100,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; + std::vector SortForReduceMode( + const std::vector &) const; + + int GetOpDeviceID( + ir::Node *node, + const std::unordered_map &shared_var_device, + std::unordered_map> *delay_ops) + const; + mutable std::string loss_var_name_; mutable std::vector places_; mutable std::vector local_scopes_; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8670dcfed7..3eb5bdba3b 100644 --- 
a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -23,66 +23,8 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { -namespace { - -void CheckProgram(const ProgramDesc &program) { -#define _INT(role) static_cast(role) - - std::map visit; - for (OpDesc *op : program.Block(0).AllOps()) { - // For backward compatibility, some program doesn't have role added. - if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; - int role_id = - boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - visit[role_id] = true; - switch (role_id) { - case _INT(OpRole::kForward): - if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) << "Cannot add backward operator before forward operator " - << op->Type(); - } - break; - case _INT(OpRole::kBackward): - case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kOptimize): - case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - break; - case _INT(OpRole::kLRSched): - case _INT(OpRole::kDist): - case _INT(OpRole::kRPC): - case _INT(OpRole::kNotSpecified): - break; - default: - LOG(FATAL) << "Unknown operator role. 
Don't add new role because " - "you don't know what you are doing."; - } - } - -#undef _INT -} -} // namespace Graph::Graph(const ProgramDesc &program) : program_(program) { - CheckProgram(program_); auto var_nodes = InitFromProgram(program_); ResolveHazard(var_nodes); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a921f469f5..e14b74a873 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -320,6 +320,7 @@ void ParallelExecutor::BCastParamsToDevices( if (paddle::platform::is_gpu_place(main_tensor.place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::vector buffers; + buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); for (size_t i = 0; i < member_->places_.size(); ++i) { @@ -353,9 +354,7 @@ void ParallelExecutor::BCastParamsToDevices( #endif } else { platform::CPUPlace cpu; - for (size_t i = 0; i < member_->places_.size(); ++i) { - if (i == 0) continue; - + for (size_t i = 1; i < member_->places_.size(); ++i) { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 74cf76da95..c97a93ec36 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -148,7 +148,7 @@ class ParallelExecutor(object): trainers_endpoints), "num_trainers == len(end_points)" build_strategy.trainers_endpoints = trainers_endpoints - # step5: get persistable_vars, parameter_vars, places. persistable_vars + # step6: get persistable_vars, places. persistable_vars # need be broadcast to other local_scope. persistable_vars = set([ cpt.to_text(v.name) for v in [ @@ -164,7 +164,7 @@ class ParallelExecutor(object): places = list(map(place_obj, self._places)) - # step6: init ParallelExecutor + # step7: init ParallelExecutor self.executor = core.ParallelExecutor( places, persistable_vars, main.desc, cpt.to_text(loss_name) diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py new file mode 100644 index 0000000000..f37d2bfb2e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py @@ -0,0 +1,188 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
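The test added below exercises weight decay written out by hand: for every parameter it computes param * learning_rate, subtracts that from the parameter with elementwise_sub, and assigns the result back, i.e. the classic shrink param <- param * (1 - learning_rate). The numeric effect, as a plain numpy sketch rather than fluid API:

    import numpy as np

    def decay(param, learning_rate):
        # elementwise_sub(x=param, y=param * learning_rate) followed by assign
        return param - learning_rate * param  # == param * (1 - learning_rate)

    w = np.array([0.5, -1.0, 2.0])
    print(decay(w, learning_rate=0.5))  # [ 0.25 -0.5   1.  ]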
+ +from __future__ import print_function +import contextlib + +import unittest +from functools import partial +import numpy as np +import paddle +import paddle.fluid.core as core + +import paddle.fluid as fluid + + +def get_places(): + places = [] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + +@contextlib.contextmanager +def prog_scope_guard(main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestWeightDecay(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=4)() + self.train_data = [next(reader) for _ in range(5)] + self.learning_rate = .5 + + def run_executor(self, place, feed_list, loss): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + main_prog = fluid.default_main_program() + loss_set = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=[loss.name]) + + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def run_parallel_exe(self, + place, + feed_list, + loss, + use_cuda=True, + use_reduce=False, + use_fast_executor=False, + use_ir_memory_optimize=False): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + exec_strategy = fluid.ExecutionStrategy() + if use_fast_executor: + exec_strategy.use_experimental_executor = True + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.memory_optimize = use_ir_memory_optimize + + parallel_exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + + loss_set = [] + for data in self.train_data: + out = parallel_exe.run(feed=feeder.feed(data), + fetch_list=[loss.name]) + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def check_weight_decay(self, + place, + model, + use_parallel_exe=False, + use_reduce=False): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog): + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = 
model(data, label, len(self.word_dict)) + + param_list = [(var, var * self.learning_rate) + for var in main_prog.block(0).all_parameters()] + + optimizer = fluid.optimizer.Adagrad( + learning_rate=self.learning_rate) + + optimizer.minimize(avg_cost) + + for params in param_list: + updated_p = fluid.layers.elementwise_sub( + x=params[0], y=params[1]) + fluid.layers.assign(input=updated_p, output=params[0]) + + if use_parallel_exe: + loss = self.run_parallel_exe( + place, [data, label], + loss=avg_cost, + use_cuda=True, + use_reduce=use_reduce) + else: + loss = self.run_executor(place, [data, label], loss=avg_cost) + + return loss + + def test_weight_decay(self): + model = partial(bow_net, is_sparse=False) + for place in get_places(): + loss = self.check_weight_decay(place, model, use_parallel_exe=False) + + loss2 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=False) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss2[i], rtol=5e-5) + + loss3 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=True) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss3[i], rtol=5e-5) + + +if __name__ == '__main__': + unittest.main() From efa630eadbfd60270ccd8dbe2f9951ef34541cde Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 27 Dec 2018 14:39:41 +0800 Subject: [PATCH 75/77] Refine Dockerfile (#14908) * Refine Dockerfile * Add tasks, cmake gen * Fix code error * Disable compile after paddle_build.sh * Refine * Skip on PY35 CI * Change env * Refine paddle_build.sh * Expose gen_fluid_lib * Refine mkldnn.cmake * Refine mkldnn.cmake * Refine mkldnnlib * Skip unstable tests --- Dockerfile | 76 +++++++++---------- cmake/external/mkldnn.cmake | 4 +- cmake/inference_lib.cmake | 2 +- paddle/scripts/paddle_build.sh | 18 +++-- .../test_image_classification_resnet.py | 12 +-- .../tests/unittests/test_dist_se_resnext.py | 15 ++++ 6 files changed, 73 insertions(+), 54 deletions(-) diff --git a/Dockerfile b/Dockerfile index 84e1edbee9..716b164ab8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. 
-RUN pip3 install -U wheel && \ - pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.6 install -U wheel && \ - pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.7 install -U wheel && \ - pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ +RUN pip3 --no-cache-dir install -U wheel && \ + pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 --no-cache-dir install -U wheel && \ + pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 --no-cache-dir install -U wheel && \ + pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ easy_install -U pip && \ - pip install -U pip setuptools wheel && \ - pip install -U docopt PyYAML sphinx==1.5.6 && \ - pip install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 install opencv-python && \ - pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 install opencv-python && \ - pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 install opencv-python && \ - pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python + pip --no-cache-dir install -U pip setuptools wheel && \ + pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 --no-cache-dir install opencv-python && \ + pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 --no-cache-dir install opencv-python && \ + pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 --no-cache-dir install opencv-python && \ + pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip --no-cache-dir install opencv-python #For docstring checker -RUN pip3 install pylint pytest astroid isort -RUN pip3.6 install pylint pytest astroid isort -RUN pip3.7 install pylint pytest astroid isort -RUN pip install pylint pytest astroid isort LinkChecker +RUN pip3 --no-cache-dir install pylint pytest astroid isort +RUN pip3.6 --no-cache-dir install pylint pytest astroid isort +RUN pip3.7 --no-cache-dir install pylint pytest astroid isort +RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ -RUN pip3 install -r /root/requirements.txt -RUN pip3.6 install -r /root/requirements.txt -RUN pip3.7 install -r /root/requirements.txt -RUN pip install -r /root/requirements.txt +RUN pip3 --no-cache-dir install -r /root/requirements.txt +RUN pip3.6 --no-cache-dir install -r /root/requirements.txt +RUN 
pip3.7 --no-cache-dir install -r /root/requirements.txt +RUN pip --no-cache-dir install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev -RUN pip3 install certifi urllib3[secure] -RUN pip3.6 install certifi urllib3[secure] -RUN pip3.7 install certifi urllib3[secure] -RUN pip install certifi urllib3[secure] +RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y +RUN pip3 --no-cache-dir install certifi urllib3[secure] +RUN pip3.6 --no-cache-dir install certifi urllib3[secure] +RUN pip3.7 --no-cache-dir install certifi urllib3[secure] +RUN pip --no-cache-dir install certifi urllib3[secure] # Install woboq_codebrowser to /woboq diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index c29375cd05..a9b99e9ab8 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -106,10 +106,10 @@ else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} - DEPENDS mkldnn) + DEPENDS mkldnn shared_mkldnn) endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) - +ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn) IF(WITH_C_API) INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) ENDIF() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 48279bc809..3e11d332ff 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -136,7 +136,7 @@ if (WITH_MKLDNN) copy(mkldnn_lib SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} DSTS ${dst_dir} ${dst_dir}/lib - DEPS mkldnn + DEPS mkldnn_shared_lib ) endif () diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 418dc13468..1220f80100 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
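A note on the test changes at the end of this series: unstable distributed tests are gated behind a SKIP_UNSTABLE_CI environment variable via a small skip_ci decorator that simply returns early, which means a skipped test is still reported as passed. A hedged sketch of the standard-library alternative that reports such tests as skipped instead (class and method names borrowed from the diff below, body illustrative):

    import os
    import unittest

    # evaluated once at import time, like skip_ci below
    ON_CI = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))

    class TestDistSeResneXt2x2(unittest.TestCase):
        @unittest.skipIf(ON_CI, "unstable on CI (SKIP_UNSTABLE_CI=1)")
        def test_dist_train(self):
            pass  # the real test calls self.check_with_place(...)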
- #================================================= # Utils #================================================= @@ -418,13 +417,6 @@ EOF else ctest --output-on-failure fi - - # make install should also be test when unittest - make install -j `nproc` - pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then - paddle version - fi fi } @@ -922,6 +914,7 @@ function main() { ;; assert_api) assert_api_not_changed ${PYTHON_ABI:-""} + assert_api_spec_approvals ;; test_inference) gen_capi_package @@ -946,6 +939,15 @@ function main() { run_test assert_api_not_changed ${PYTHON_ABI:-""} ;; + cmake_gen) + cmake_gen ${PYTHON_ABI:-""} + ;; + gen_fluid_lib) + gen_fluid_lib + ;; + test_fluid_lib) + test_fluid_lib + ;; *) print_usage exit 0 diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index d744a00242..e87c1d58c8 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -185,8 +185,10 @@ def main(use_cuda, parallel): if __name__ == '__main__': - for use_cuda in (False, True): - for parallel in (False, True): - if use_cuda and not core.is_compiled_with_cuda(): - continue - main(use_cuda=use_cuda, parallel=parallel) + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + if not on_ci: + for use_cuda in (False, True): + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c2a4e5ca0c..28602d3251 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -15,6 +15,18 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import os + + +def skip_ci(func): + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + + def __func__(*args, **kwargs): + if on_ci: + return + return func(*args, **kwargs) + + return __func__ class TestDistSeResneXt2x2(TestDistBase): @@ -22,6 +34,7 @@ class TestDistSeResneXt2x2(TestDistBase): self._sync_mode = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -32,6 +45,7 @@ class TestDistseResnXt2x2WithMemopt(TestDistBase): self._mem_opt = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -41,6 +55,7 @@ class TestDistSeResneXt2x2Async(TestDistBase): self._sync_mode = False self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) From f7294f8b251a3907a872c9b7a5b3d02ecdfdbe76 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 27 Dec 2018 13:41:23 +0800 Subject: [PATCH 76/77] register float16 test=develop --- paddle/fluid/operators/fill_constant_op.cc | 3 ++- paddle/fluid/operators/fill_constant_op.cu.cc | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 73f38de08e..c86430524e 100644 --- 
a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -87,4 +87,5 @@ REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
 REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel<float>,
                        ops::FillConstantKernel<double>,
                        ops::FillConstantKernel<int>,
-                       ops::FillConstantKernel<int64_t>);
+                       ops::FillConstantKernel<int64_t>,
+                       ops::FillConstantKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc
index fba5583505..77027b5a87 100644
--- a/paddle/fluid/operators/fill_constant_op.cu.cc
+++ b/paddle/fluid/operators/fill_constant_op.cu.cc
@@ -17,4 +17,6 @@ limitations under the License. */
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel<float>,
                         ops::FillConstantKernel<double>,
-                        ops::FillConstantKernel<int64_t>);
+                        ops::FillConstantKernel<int64_t>,
+                        ops::FillConstantKernel<int>,
+                        ops::FillConstantKernel<paddle::platform::float16>);

From e26cced7ccad46c3165b9c8dc2ee8831c0f5aa8d Mon Sep 17 00:00:00 2001
From: Wu Yi
Date: Thu, 27 Dec 2018 18:51:01 +0800
Subject: [PATCH 77/77] refine batch merge pass (#14777)

* refine batch merge pass

* refine batch merge pass

test=develop
---
 .../framework/ir/multi_batch_merge_pass.cc    | 29 ++++++++++++++++---
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
index bd5b76426e..9e77f98e9e 100644
--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -75,6 +75,7 @@ std::unique_ptr<ir::Graph> BatchMergePass::ApplyImpl(
   std::vector<ir::Node*> optimize_ops;
   std::vector<ir::Node*> lr_ops;  // ops other than forward/backward/optimize
   std::unordered_set<std::string> grad_names;
+  std::unordered_map<std::string, std::string> gradname2paramname;
 
   std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
   auto origin_nodes = graph->ReleaseNodes();
@@ -99,6 +100,7 @@ std::unique_ptr<ir::Graph> BatchMergePass::ApplyImpl(
       auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
       for (size_t i = 0; i < op_role_vars.size(); i += 2) {
         grad_names.insert(op_role_vars[i + 1]);
+        gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i];
       }
     } else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
       lr_ops.push_back(node);
@@ -109,7 +111,7 @@ std::unique_ptr<ir::Graph> BatchMergePass::ApplyImpl(
 
   // 2. copy forward backward
   ir::Node* prev_repeat_last_op_node = nullptr;
-  // record origin_grad -> repeated grad list map.
+  // record origin_grad -> repeated_grad_list map.
   std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
   std::map<std::string, std::vector<ir::Node*>> created;
   std::unordered_set<std::string> bn_vars_need_rename;
@@ -124,10 +126,16 @@ std::unique_ptr<ir::Graph> BatchMergePass::ApplyImpl(
       if (grad_names.find(outname) != grad_names.end()) {
         std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
         repeated_op.RenameOutput(outname, new_gname);
+        // remove op_role_var for backward ops that outputs grad for a
+        // parameter.
+        repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
+                            std::vector<std::string>());
       }
     }
     // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
-    // not need this update
+    // not need this update, because only moving mean and variance should be
+    // differ, trainable parameter scale and bias is the same as other
+    // parameters.
     if (node->Name() == "batch_norm") {
       // NOTE: assume bn op created by layers use save var as output mean and
       // variance
@@ -224,16 +232,25 @@ std::unique_ptr<ir::Graph> BatchMergePass::ApplyImpl(
         var->inputs.push_back(repeated_node);
       }
     }
-  }
+  }  // end copy forward backward
 
-  // 5. create GRAD merge op node
+  // 5. create GRAD merge op node: sum(repeat.0...repeat.n) ->
+  // scale(1/num_repeats)
   for (auto kv : grad_repeated_map) {
     OpDesc sum_op;
     sum_op.SetType("sum");
     std::vector<std::string> repeated_grad_names;
+    std::vector<std::string> param_grad_op_role_var;
     for (auto r : kv.second) {
       repeated_grad_names.push_back(r->Var()->Name());
     }
+    // NOTE: use op_role_var to control allreduce op appending in
+    // multi_devices_graph_pass, we want to append op_role_var
+    // only once for the merged gradient, so break after first call.
+    param_grad_op_role_var.push_back(
+        gradname2paramname.at(kv.first->Var()->Name()));  // param
+    param_grad_op_role_var.push_back(kv.first->Var()->Name());  // grad
+
     sum_op.SetInput("X", repeated_grad_names);
     sum_op.SetOutput("Out", {kv.first->Var()->Name()});
     sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
@@ -256,6 +273,10 @@ std::unique_ptr<ir::Graph> BatchMergePass::ApplyImpl(
     scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
     scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
                      static_cast<int>(OpRole::kBackward));
+
+    scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(),
+                     param_grad_op_role_var);
+
     auto scale_op_node = result.CreateOpNode(&scale_op);
     scale_op_node->inputs.push_back(sum_out_var_node);
     sum_out_var_node->outputs.push_back(scale_op_node);
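Net effect of the pass: each parameter's merged gradient is sum(grad.repeat.0 ... grad.repeat.n) followed by scale(1/num_repeats), i.e. the average of the per-repeat gradients, and op_role_var is attached only to the scale op so that the multi-device pass appends a single allreduce for the merged result. A numpy sketch of the arithmetic (illustrative data only, not Paddle API):

    import numpy as np

    num_repeats = 4
    repeated_grads = [np.random.randn(3) for _ in range(num_repeats)]

    # sum op over grad.repeat.* followed by scale(1.0 / num_repeats)
    merged = sum(repeated_grads) * (1.0 / num_repeats)

    # identical to averaging the per-repeat gradients directly
    assert np.allclose(merged, np.mean(repeated_grads, axis=0))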