From 681514e15ffbba78def454402f24d5a56f66546c Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Mon, 10 Sep 2018 12:20:08 +0800
Subject: [PATCH 01/17] Make all scope pointers shared_ptr

---
 .../fast_threaded_ssa_graph_executor.cc       |  3 +-
 .../fast_threaded_ssa_graph_executor.h        | 11 ++++---
 .../framework/details/fetch_op_handle.cc      |  2 +-
 .../fluid/framework/details/fetch_op_handle.h |  4 +--
 .../scope_buffered_ssa_graph_executor.cc      |  3 +-
 .../scope_buffered_ssa_graph_executor.h       |  5 +--
 .../details/threaded_ssa_graph_executor.cc    |  3 +-
 .../details/threaded_ssa_graph_executor.h     | 11 ++++---
 paddle/fluid/framework/parallel_executor.cc   | 31 ++++++++++++-------
 paddle/fluid/framework/parallel_executor.h    | 21 +++++++------
 paddle/fluid/framework/scope.cc               | 11 ++++---
 paddle/fluid/framework/scope.h                |  2 +-
 12 files changed, 63 insertions(+), 44 deletions(-)
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 7606f2bc06..a9b89614ae 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -22,7 +22,8 @@ namespace framework {
 namespace details {
 
 FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
-    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
+    const ExecutionStrategy &strategy,
+    const std::vector<std::shared_ptr<Scope>> &local_scopes,
     const std::vector<platform::Place> &places,
     std::unique_ptr<ir::Graph> &&graph)
     : strategy_(strategy),
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index dad3a231cb..fb615d70b7 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -29,16 +29,17 @@ namespace details {
 class OpHandleBase;
 class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
-  FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
-                               const std::vector<Scope *> &local_scopes,
-                               const std::vector<platform::Place> &places,
-                               std::unique_ptr<ir::Graph> &&graph);
+  FastThreadedSSAGraphExecutor(
+      const ExecutionStrategy &strategy,
+      const std::vector<std::shared_ptr<Scope>> &local_scopes,
+      const std::vector<platform::Place> &places,
+      std::unique_ptr<ir::Graph> &&graph);
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
   const ir::Graph &Graph() const override;
 
  private:
   ExecutionStrategy strategy_;
-  std::vector<Scope *> local_scopes_;
+  std::vector<std::shared_ptr<Scope>> local_scopes_;
   std::vector<platform::Place> places_;
   std::unique_ptr<ir::Graph> graph_;
 
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index fe18b2060c..2f4aefd39d 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -22,7 +22,7 @@ namespace framework {
 namespace details {
 
 FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
-                             std::vector<Scope *> *local_scopes)
+                             std::vector<std::shared_ptr<Scope>> *local_scopes)
     : OpHandleBase(node),
       data_(data),
       offset_(offset),
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index 6ce42f92d7..a207e36b8a 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -29,7 +29,7 @@ namespace details {
 struct FetchOpHandle : public OpHandleBase {
  public:
   FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
-                std::vector<Scope *> *local_scopes);
+                std::vector<std::shared_ptr<Scope>> *local_scopes);
 
   ~FetchOpHandle();
 
@@ -47,7 +47,7 @@ struct FetchOpHandle : public OpHandleBase {
  private:
   FeedFetchList *data_;
   size_t offset_;
-  std::vector<Scope *> *local_scopes_;
+  std::vector<std::shared_ptr<Scope>> *local_scopes_;
   std::vector<LoDTensor> tensors_;
 };
 
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 5bd974d6b7..bf5671c679 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -23,7 +23,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
-    ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
+    ExecutionStrategy strategy,
+    std::vector<std::shared_ptr<Scope>> local_scopes,
     std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
     std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
     : strategy_(std::move(strategy)),
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index 5e87e0bf50..ec31755af5 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -37,7 +37,8 @@ struct VariableInfo {
 class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
  public:
   ScopeBufferedSSAGraphExecutor(
-      ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
+      ExecutionStrategy strategy,
+      std::vector<std::shared_ptr<Scope>> local_scopes,
       std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
       std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
 
@@ -52,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
 
   ExecutionStrategy strategy_;
   std::unique_ptr<SSAGraphExecutor> underlying_executor_;
-  std::vector<Scope*> local_scopes_;
+  std::vector<std::shared_ptr<Scope>> local_scopes_;
   std::vector<VariableInfo> var_infos_;
   std::vector<platform::Place> places_;
 };
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index c9e331ef35..cc6f444363 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -21,7 +21,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
-    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
+    const ExecutionStrategy &strategy,
+    const std::vector<std::shared_ptr<Scope>> &local_scopes,
     const std::vector<platform::Place> &places,
     std::unique_ptr<ir::Graph> &&graph)
     : graph_(std::move(graph)),
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 9135c1f5d4..2a74af6c3d 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -38,10 +38,11 @@ namespace details {
 
 class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
-  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
-                           const std::vector<Scope *> &local_scopes,
-                           const std::vector<platform::Place> &places,
-                           std::unique_ptr<ir::Graph> &&graph);
+  ThreadedSSAGraphExecutor(
+      const ExecutionStrategy &strategy,
+      const std::vector<std::shared_ptr<Scope>> &local_scopes,
+      const std::vector<platform::Place> &places,
+      std::unique_ptr<ir::Graph> &&graph);
 
   const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
@@ -57,7 +58,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  private:
   std::unique_ptr<ir::Graph> graph_;
   std::unique_ptr<::ThreadPool> pool_;
-  std::vector<Scope *> local_scopes_;
+  std::vector<std::shared_ptr<Scope>> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
   ExceptionHolder exception_holder_;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 81cb24bdda..93c74deb3e 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -39,7 +39,8 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
     const ProgramDesc &main_program, const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &param_names,
-    const std::vector<Scope *> &local_scopes, const bool use_cuda,
+    const std::vector<std::shared_ptr<Scope>> &local_scopes,
+    const bool use_cuda,
 #ifdef PADDLE_WITH_CUDA
     const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
 #else
@@ -66,8 +67,8 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
                                                      &loss_var_name);
   multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
       "params", &param_names);
-  multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
-                                                              &local_scopes);
+  multi_devices_pass->SetNotOwned<const std::vector<std::shared_ptr<Scope>>>(
+      "local_scopes", &local_scopes);
   multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
 
 #ifdef PADDLE_WITH_CUDA
@@ -100,8 +101,8 @@ class ParallelExecutorPrivate {
       : places_(places) {}
 
   std::vector<platform::Place> places_;
-  std::vector<Scope *> local_scopes_;
-  Scope *global_scope_;
+  std::vector<std::shared_ptr<Scope>> local_scopes_;
+  std::shared_ptr<Scope> global_scope_;
   std::unique_ptr<details::SSAGraphExecutor> executor_;
 
 #ifdef PADDLE_WITH_CUDA
@@ -112,7 +113,7 @@ class ParallelExecutorPrivate {
   bool use_all_reduce_;
 };
 
-std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
+std::vector<std::shared_ptr<Scope>> &ParallelExecutor::GetLocalScopes() {
   return member_->local_scopes_;
 }
 
@@ -121,7 +122,8 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &params,
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
-    Scope *scope, const std::vector<Scope *> &local_scopes,
+    const std::shared_ptr<Scope> &scope,
+    const std::vector<std::shared_ptr<Scope>> &local_scopes,
     const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
     size_t num_trainers, size_t trainer_id)
     : member_(new ParallelExecutorPrivate(places)) {
@@ -142,13 +144,13 @@ ParallelExecutor::ParallelExecutor(
     member_->own_local_scope_ = true;
     member_->local_scopes_.emplace_back(member_->global_scope_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.emplace_back(&scope->NewScope());
+      member_->local_scopes_.emplace_back(scope->NewSharedScope());
     }
   } else {
     member_->own_local_scope_ = false;
     PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
     for (size_t i = 0; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope());
+      member_->local_scopes_.emplace_back(local_scopes[i]->NewSharedScope());
     }
   }
 
@@ -321,7 +323,7 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes(
 
   for (size_t i = 0; i < tensors.size(); ++i) {
     auto &map = tensors[i];
-    auto *scope = member_->local_scopes_[i];
+    auto &scope = member_->local_scopes_[i];
     for (auto &pair : map) {
       auto *trg = scope->Var(pair.first)->GetMutable<LoDTensor>();
       trg->ShareDataWith(pair.second);
@@ -351,8 +353,15 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 
 ParallelExecutor::~ParallelExecutor() {
   if (member_->own_local_scope_) {
+    std::vector<Scope *> local_scopes_ptrs;
+    local_scopes_ptrs.reserve(member_->local_scopes_.size());
     for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
-      member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
+      local_scopes_ptrs.emplace_back(member_->local_scopes_[i].get());
+      member_->local_scopes_[i].reset();
+    }
+
+    for (size_t i = 0; i != local_scopes_ptrs.size(); ++i) {
+      member_->global_scope_->DeleteScope(local_scopes_ptrs[i]);
     }
   }
 }
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 5fb748fa20..ce1076e44b 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -39,19 +39,20 @@ class ParallelExecutor {
   DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
 
  public:
-  explicit ParallelExecutor(const std::vector<platform::Place> &places,
-                            const std::unordered_set<std::string> &params,
-                            const std::unordered_set<std::string> &bcast_vars,
-                            const ProgramDesc &main_program,
-                            const std::string &loss_var_name, Scope *scope,
-                            const std::vector<Scope *> &local_scopes,
-                            const ExecutionStrategy &exec_strategy,
-                            const BuildStrategy &build_strategy,
-                            size_t num_trainers = 1, size_t trainer_id = 0);
+  explicit ParallelExecutor(
+      const std::vector<platform::Place> &places,
+      const std::unordered_set<std::string> &params,
+      const std::unordered_set<std::string> &bcast_vars,
+      const ProgramDesc &main_program, const std::string &loss_var_name,
+      const std::shared_ptr<Scope> &scope,
+      const std::vector<std::shared_ptr<Scope>> &local_scopes,
+      const ExecutionStrategy &exec_strategy,
+      const BuildStrategy &build_strategy, size_t num_trainers = 1,
+      size_t trainer_id = 0);
 
   ~ParallelExecutor();
 
-  std::vector<Scope *> &GetLocalScopes();
+  std::vector<std::shared_ptr<Scope>> &GetLocalScopes();
 
   /**
    * Feed tensors to local scopes. The size of tensors should be equal to the
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 50f374e370..fa6bf4429d 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -38,8 +38,8 @@ Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
   std::unique_lock<std::mutex> lock(mutex_);
-  kids_.push_back(new Scope(this));
-  return *kids_.back();
+  kids_.push_back(std::shared_ptr<Scope>(new Scope(this)));
+  return kids_.back().get();
 }
 
 Variable* Scope::Var(const std::string& name) {
@@ -68,7 +68,6 @@ const Scope* Scope::FindScope(const Variable* var) const {
 
 void Scope::DropKids() {
   std::unique_lock<std::mutex> lock(mutex_);
-  for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
@@ -84,8 +83,12 @@ std::vector<std::string> Scope::LocalVarNames() const {
 
 void Scope::DeleteScope(Scope* scope) const {
   std::unique_lock<std::mutex> lock(mutex_);
-  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
+  auto it = std::find_if(this->kids_.begin(), this->kids_.end(),
+                         [&scope](const std::shared_ptr<Scope>& kid) {
+                           return kid.get() == scope;
+                         });
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
+  it->reset();
   this->kids_.erase(it);
   // When making memory benchmark on Fluid, we have to delete scope sync.
   if (FLAGS_benchmark || FLAGS_eager_delete_scope) {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index e246241c0a..0ba5d34798 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -105,7 +105,7 @@ class Scope {
   Variable* FindVarLocally(const std::string& name) const;
 
   // Scope in `kids_` are owned by this class.
-  mutable std::list<Scope*> kids_;
+  mutable std::list<std::shared_ptr<Scope>> kids_;
   Scope const* parent_{nullptr};
 
   DISABLE_COPY_AND_ASSIGN(Scope);

From dc863aac7edeccbe8362d625b2c1e6eeca885000 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Mon, 10 Sep 2018 14:29:19 +0800
Subject: [PATCH 02/17] Add kid existence check (HasKid) to Scope; revert scopes to raw pointers

---
 .../fast_threaded_ssa_graph_executor.cc       |  3 +-
 .../fast_threaded_ssa_graph_executor.h        | 11 +++---
 .../framework/details/fetch_op_handle.cc      |  2 +-
 .../fluid/framework/details/fetch_op_handle.h |  4 +--
 .../scope_buffered_ssa_graph_executor.cc      |  3 +-
 .../scope_buffered_ssa_graph_executor.h       |  5 ++-
 .../details/threaded_ssa_graph_executor.cc    |  3 +-
 .../details/threaded_ssa_graph_executor.h     | 11 +++---
 paddle/fluid/framework/parallel_executor.cc   | 34 ++++++++-----------
 paddle/fluid/framework/parallel_executor.h    | 21 ++++++------
 paddle/fluid/framework/scope.cc               | 17 ++++++----
 paddle/fluid/framework/scope.h                |  5 ++-
 .../test_image_classification_resnet.py       |  5 +--
 .../test_image_classification_vgg.py          |  5 +--
 .../test_recognize_digits_conv.py             |  5 +--
 .../test_recognize_digits_mlp.py              |  5 +--
 16 files changed, 60 insertions(+), 79 deletions(-)

diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index a9b89614ae..7606f2bc06 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -22,8 +22,7 @@ namespace framework {
 namespace details {
 
 FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
-    const ExecutionStrategy &strategy,
-    const std::vector<std::shared_ptr<Scope>> &local_scopes,
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
     std::unique_ptr<ir::Graph> &&graph)
     : strategy_(strategy),
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index fb615d70b7..dad3a231cb 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -29,17 +29,16 @@ namespace details {
 class OpHandleBase;
 class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
-  FastThreadedSSAGraphExecutor(
-      const ExecutionStrategy &strategy,
-      const std::vector<std::shared_ptr<Scope>> &local_scopes,
-      const std::vector<platform::Place> &places,
-      std::unique_ptr<ir::Graph> &&graph);
+  FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
+                               const std::vector<Scope *> &local_scopes,
+                               const std::vector<platform::Place> &places,
+                               std::unique_ptr<ir::Graph> &&graph);
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
   const ir::Graph &Graph() const override;
 
  private:
   ExecutionStrategy strategy_;
-  std::vector<std::shared_ptr<Scope>> local_scopes_;
+  std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   std::unique_ptr<ir::Graph> graph_;
 
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 2f4aefd39d..fe18b2060c 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -22,7 +22,7 @@ namespace framework {
 namespace details {
 
 FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
-                             std::vector<std::shared_ptr<Scope>> *local_scopes)
+                             std::vector<Scope *> *local_scopes)
     : OpHandleBase(node),
       data_(data),
       offset_(offset),
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index a207e36b8a..6ce42f92d7 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -29,7 +29,7 @@ namespace details {
 struct FetchOpHandle : public OpHandleBase {
  public:
   FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
-                std::vector<std::shared_ptr<Scope>> *local_scopes);
+                std::vector<Scope *> *local_scopes);
 
   ~FetchOpHandle();
 
@@ -47,7 +47,7 @@ struct FetchOpHandle : public OpHandleBase {
  private:
   FeedFetchList *data_;
   size_t offset_;
-  std::vector<std::shared_ptr<Scope>> *local_scopes_;
+  std::vector<Scope *> *local_scopes_;
   std::vector<LoDTensor> tensors_;
 };
 
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index bf5671c679..5bd974d6b7 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -23,8 +23,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
-    ExecutionStrategy strategy,
-    std::vector<std::shared_ptr<Scope>> local_scopes,
+    ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
     std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
     std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
     : strategy_(std::move(strategy)),
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index ec31755af5..5e87e0bf50 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -37,8 +37,7 @@ struct VariableInfo {
 class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
  public:
   ScopeBufferedSSAGraphExecutor(
-      ExecutionStrategy strategy,
-      std::vector<std::shared_ptr<Scope>> local_scopes,
+      ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
       std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
       std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
 
@@ -53,7 +52,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
 
   ExecutionStrategy strategy_;
   std::unique_ptr<SSAGraphExecutor> underlying_executor_;
-  std::vector<std::shared_ptr<Scope>> local_scopes_;
+  std::vector<Scope*> local_scopes_;
   std::vector<VariableInfo> var_infos_;
   std::vector<platform::Place> places_;
 };
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index cc6f444363..c9e331ef35 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -21,8 +21,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
-    const ExecutionStrategy &strategy,
-    const std::vector<std::shared_ptr<Scope>> &local_scopes,
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
     std::unique_ptr<ir::Graph> &&graph)
     : graph_(std::move(graph)),
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 2a74af6c3d..9135c1f5d4 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -38,11 +38,10 @@ namespace details {
 
 class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
-  ThreadedSSAGraphExecutor(
-      const ExecutionStrategy &strategy,
-      const std::vector<std::shared_ptr<Scope>> &local_scopes,
-      const std::vector<platform::Place> &places,
-      std::unique_ptr<ir::Graph> &&graph);
+  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
+                           const std::vector<Scope *> &local_scopes,
+                           const std::vector<platform::Place> &places,
+                           std::unique_ptr<ir::Graph> &&graph);
 
   const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
@@ -58,7 +57,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  private:
   std::unique_ptr<ir::Graph> graph_;
   std::unique_ptr<::ThreadPool> pool_;
-  std::vector<std::shared_ptr<Scope>> local_scopes_;
+  std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
   ExceptionHolder exception_holder_;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 93c74deb3e..5b8c75a93d 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -39,8 +39,7 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
     const ProgramDesc &main_program, const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &param_names,
-    const std::vector<std::shared_ptr<Scope>> &local_scopes,
-    const bool use_cuda,
+    const std::vector<Scope *> &local_scopes, const bool use_cuda,
 #ifdef PADDLE_WITH_CUDA
     const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
 #else
@@ -67,8 +66,8 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
                                                      &loss_var_name);
   multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
       "params", &param_names);
-  multi_devices_pass->SetNotOwned<const std::vector<std::shared_ptr<Scope>>>(
-      "local_scopes", &local_scopes);
+  multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+                                                              &local_scopes);
   multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
 
 #ifdef PADDLE_WITH_CUDA
@@ -101,8 +100,8 @@ class ParallelExecutorPrivate {
       : places_(places) {}
 
   std::vector<platform::Place> places_;
-  std::vector<std::shared_ptr<Scope>> local_scopes_;
-  std::shared_ptr<Scope> global_scope_;
+  std::vector<Scope *> local_scopes_;
+  Scope *global_scope_;
   std::unique_ptr<details::SSAGraphExecutor> executor_;
 
 #ifdef PADDLE_WITH_CUDA
@@ -113,7 +112,7 @@ class ParallelExecutorPrivate {
   bool use_all_reduce_;
 };
 
-std::vector<std::shared_ptr<Scope>> &ParallelExecutor::GetLocalScopes() {
+std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
   return member_->local_scopes_;
 }
 
@@ -122,8 +121,7 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &params,
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
-    const std::shared_ptr<Scope> &scope,
-    const std::vector<std::shared_ptr<Scope>> &local_scopes,
+    Scope *scope, const std::vector<Scope *> &local_scopes,
     const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
     size_t num_trainers, size_t trainer_id)
     : member_(new ParallelExecutorPrivate(places)) {
@@ -144,13 +142,13 @@ ParallelExecutor::ParallelExecutor(
     member_->own_local_scope_ = true;
     member_->local_scopes_.emplace_back(member_->global_scope_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.emplace_back(scope->NewSharedScope());
+      member_->local_scopes_.emplace_back(&scope->NewScope());
     }
   } else {
     member_->own_local_scope_ = false;
     PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
     for (size_t i = 0; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.emplace_back(local_scopes[i]->NewSharedScope());
+      member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope());
     }
   }
 
@@ -323,7 +321,7 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes(
 
   for (size_t i = 0; i < tensors.size(); ++i) {
     auto &map = tensors[i];
-    auto &scope = member_->local_scopes_[i];
+    auto *scope = member_->local_scopes_[i];
     for (auto &pair : map) {
       auto *trg = scope->Var(pair.first)->GetMutable<LoDTensor>();
       trg->ShareDataWith(pair.second);
@@ -353,15 +351,11 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 
 ParallelExecutor::~ParallelExecutor() {
   if (member_->own_local_scope_) {
-    std::vector<Scope *> local_scopes_ptrs;
-    local_scopes_ptrs.reserve(member_->local_scopes_.size());
     for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
-      local_scopes_ptrs.emplace_back(member_->local_scopes_[i].get());
-      member_->local_scopes_[i].reset();
-    }
-
-    for (size_t i = 0; i != local_scopes_ptrs.size(); ++i) {
-      member_->global_scope_->DeleteScope(local_scopes_ptrs[i]);
+      Scope *local_scope = member_->local_scopes_[i];
+      if (member_->global_scope_->HasKid(local_scope)) {
+        member_->global_scope_->DeleteScope(local_scope);
+      }
     }
   }
 }
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index ce1076e44b..5fb748fa20 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -39,20 +39,19 @@ class ParallelExecutor {
   DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
 
  public:
-  explicit ParallelExecutor(
-      const std::vector<platform::Place> &places,
-      const std::unordered_set<std::string> &params,
-      const std::unordered_set<std::string> &bcast_vars,
-      const ProgramDesc &main_program, const std::string &loss_var_name,
-      const std::shared_ptr<Scope> &scope,
-      const std::vector<std::shared_ptr<Scope>> &local_scopes,
-      const ExecutionStrategy &exec_strategy,
-      const BuildStrategy &build_strategy, size_t num_trainers = 1,
-      size_t trainer_id = 0);
+  explicit ParallelExecutor(const std::vector<platform::Place> &places,
+                            const std::unordered_set<std::string> &params,
+                            const std::unordered_set<std::string> &bcast_vars,
+                            const ProgramDesc &main_program,
+                            const std::string &loss_var_name, Scope *scope,
+                            const std::vector<Scope *> &local_scopes,
+                            const ExecutionStrategy &exec_strategy,
+                            const BuildStrategy &build_strategy,
+                            size_t num_trainers = 1, size_t trainer_id = 0);
 
   ~ParallelExecutor();
 
-  std::vector<std::shared_ptr<Scope>> &GetLocalScopes();
+  std::vector<Scope *> &GetLocalScopes();
 
   /**
    * Feed tensors to local scopes. The size of tensors should be equal to the
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index fa6bf4429d..2be655b89a 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -38,8 +38,8 @@ Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
   std::unique_lock<std::mutex> lock(mutex_);
-  kids_.push_back(std::shared_ptr<Scope>(new Scope(this)));
-  return kids_.back().get();
+  kids_.push_back(new Scope(this));
+  return *kids_.back();
 }
 
 Variable* Scope::Var(const std::string& name) {
@@ -68,9 +68,16 @@ const Scope* Scope::FindScope(const Variable* var) const {
 
 void Scope::DropKids() {
   std::unique_lock<std::mutex> lock(mutex_);
+  for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
+bool Scope::HasKid(const Scope* scope) const {
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
+  return it != this->kids_.end();
+}
+
 std::vector<std::string> Scope::LocalVarNames() const {
   std::unique_lock<std::mutex> lock(mutex_);
   std::vector<std::string> known_vars;
@@ -83,12 +90,8 @@ std::vector<std::string> Scope::LocalVarNames() const {
 
 void Scope::DeleteScope(Scope* scope) const {
   std::unique_lock<std::mutex> lock(mutex_);
-  auto it = std::find_if(this->kids_.begin(), this->kids_.end(),
-                         [&scope](const std::shared_ptr<Scope>& kid) {
-                           return kid.get() == scope;
-                         });
+  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
-  it->reset();
   this->kids_.erase(it);
   // When making memory benchmark on Fluid, we have to delete scope sync.
   if (FLAGS_benchmark || FLAGS_eager_delete_scope) {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 0ba5d34798..b6165a595d 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -71,6 +71,9 @@ class Scope {
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
 
+  /// Find if a scope exists in the kid scopes
+  bool HasKid(const Scope* scope) const;
+
   // enumerate all the variables current contains.
   std::vector<std::string> LocalVarNames() const;
 
@@ -105,7 +108,7 @@ class Scope {
   Variable* FindVarLocally(const std::string& name) const;
 
   // Scope in `kids_` are owned by this class.
-  mutable std::list<std::shared_ptr<Scope>> kids_;
+  mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
 
   DISABLE_COPY_AND_ASSIGN(Scope);
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index e5ae95e2d9..de276755bb 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -178,7 +178,4 @@ if __name__ == '__main__':
         for parallel in (False, True):
             if use_cuda and not core.is_compiled_with_cuda():
                 continue
-            # TODO(minqiyang): remove this line after fixing the deletion
-            # order problem of Scope in ParallelExecutor in manylinux
-            if six.PY2:
-                main(use_cuda=use_cuda, parallel=parallel)
+            main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index ff91be72c9..dd547f3448 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -152,7 +152,4 @@ if __name__ == '__main__':
         for parallel in (False, True):
             if use_cuda and not core.is_compiled_with_cuda():
                 continue
-            # TODO(minqiyang): remove this line after fixing the deletion
-            # order problem of Scope in ParallelExecutor in manylinux
-            if six.PY2:
-                main(use_cuda=use_cuda, parallel=parallel)
+            main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
index fa72c939e5..973308498b 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -155,7 +155,4 @@ if __name__ == '__main__':
         for parallel in (False, True):
             if use_cuda and not core.is_compiled_with_cuda():
                 continue
-            # TODO(minqiyang): remove this line after fixing the deletion
-            # order problem of Scope in ParallelExecutor in manylinux
-            if six.PY2:
-                main(use_cuda=use_cuda, parallel=parallel)
+            main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
index 440d2a3083..cb4aeb430e 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -137,7 +137,4 @@ if __name__ == '__main__':
         for parallel in (False, True):
             if use_cuda and not core.is_compiled_with_cuda():
                 continue
-            # TODO(minqiyang): remove this line after fixing the deletion
-            # order problem of Scope in ParallelExecutor in manylinux
-            if six.PY2:
-                main(use_cuda=use_cuda, parallel=parallel)
+            main(use_cuda=use_cuda, parallel=parallel)

From e0436ad8bbaed57b9c2c60f100d1e1f86fe42e07 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 11 Sep 2018 16:07:07 +0800
Subject: [PATCH 03/17] refine fusion lstm infershape

---
 paddle/fluid/framework/operator.cc           | 277 ++++++++-----------
 paddle/fluid/framework/shape_runtime_infer.h |  86 ++++++
 paddle/fluid/operators/fusion_lstm_op.cc     |  81 ++++--
 3 files changed, 260 insertions(+), 184 deletions(-)
 create mode 100644 paddle/fluid/framework/shape_runtime_infer.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d58d6e4f3e..36025db7ba 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/shape_runtime_infer.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -458,187 +459,147 @@ bool OpSupportGPU(const std::string& op_type) {
   return false;
 }
 
-class RuntimeInferShapeContext : public InferShapeContext {
- public:
-  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
-      : op_(op), scope_(scope) {}
-
-  bool HasInput(const std::string& name) const override {
-    if (!op_.HasInputs(name)) {
-      return false;
-    }
-    auto& ins = Inputs(name);
-    size_t length = ins.size();
-    if (length == 0) {
-      return false;
-    }
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Input %s should not have more than one inputs", name);
-    auto ipt = ins[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
+bool RuntimeInferShapeContext::HasInput(const std::string& name) const {
+  if (!op_.HasInputs(name)) {
+    return false;
   }
-
-  bool HasOutput(const std::string& name) const override {
-    if (!op_.HasOutputs(name)) {
-      return false;
-    }
-    auto& outs = Outputs(name);
-    size_t length = outs.size();
-    if (length == 0) {
-      return false;
-    }
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Output %s should not have more than one inputs", name);
-    auto ipt = outs[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
+  auto& ins = Inputs(name);
+  size_t length = ins.size();
+  if (length == 0) {
+    return false;
   }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input %s should not have more than one inputs", name);
+  auto ipt = ins[0];
+  auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+  return var != nullptr;
+}
 
-  bool HasInputs(const std::string& name) const override {
-    if (!op_.HasInputs(name)) {
-      return false;
-    }
-    auto inputs = op_.Inputs(name);
-    if (inputs.empty()) {
-      return false;
-    }
-    for (auto& input : inputs) {
-      if (scope_.FindVar(input) == nullptr) {
-        return false;
-      }
-    }
-    return true;
+bool RuntimeInferShapeContext::HasOutput(const std::string& name) const {
+  if (!op_.HasOutputs(name)) {
+    return false;
   }
+  auto& outs = Outputs(name);
+  size_t length = outs.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Output %s should not have more than one inputs", name);
+  auto ipt = outs[0];
+  auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+  return var != nullptr;
+}
 
-  bool HasOutputs(const std::string& name) const override {
-    if (!op_.HasOutputs(name)) {
-      return false;
-    }
-    auto outputs = op_.Outputs(name);
-    if (outputs.empty()) {
+bool RuntimeInferShapeContext::HasInputs(const std::string& name) const {
+  if (!op_.HasInputs(name)) {
+    return false;
+  }
+  auto inputs = op_.Inputs(name);
+  if (inputs.empty()) {
+    return false;
+  }
+  for (auto& input : inputs) {
+    if (scope_.FindVar(input) == nullptr) {
       return false;
     }
-    for (auto& output : outputs) {
-      if (scope_.FindVar(output) == nullptr) {
-        return false;
-      }
-    }
-    return true;
   }
+  return true;
+}
 
-  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
-
-  const std::vector<std::string>& Inputs(
-      const std::string& name) const override {
-    return op_.Inputs(name);
+bool RuntimeInferShapeContext::HasOutputs(const std::string& name) const {
+  if (!op_.HasOutputs(name)) {
+    return false;
   }
-
-  const std::vector<std::string>& Outputs(
-      const std::string& name) const override {
-    return op_.Outputs(name);
+  auto outputs = op_.Outputs(name);
+  if (outputs.empty()) {
+    return false;
   }
+  for (auto& output : outputs) {
+    if (scope_.FindVar(output) == nullptr) {
+      return false;
+    }
+  }
+  return true;
+}
 
-  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
-                size_t j = 0) const override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-    if (!in_var->IsType<LoDTensor>()) return;
-    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-    auto in_tensor = in_var->Get<LoDTensor>();
-    auto* out_tensor = out_var->GetMutable<LoDTensor>();
-    out_tensor->set_lod(in_tensor.lod());
+void RuntimeInferShapeContext::ShareLoD(const std::string& in,
+                                        const std::string& out, size_t i,
+                                        size_t j) const {
+  PADDLE_ENFORCE_LT(i, Inputs(in).size());
+  PADDLE_ENFORCE_LT(j, Outputs(out).size());
+  Variable* in_var = scope_.FindVar(Inputs(in)[i]);
+  Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+  if (!in_var->IsType<LoDTensor>()) return;
+  PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                 "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+  auto in_tensor = in_var->Get<LoDTensor>();
+  auto* out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->set_lod(in_tensor.lod());
 
 // TODO(dzhwinter) : reuse ShareLoD in most operators.
 // Need to call ShareLayout explicitly in sequence related ops.
 // Shall we have a better method to shared info between in/out Tensor?
 #ifdef PADDLE_WITH_MKLDNN
-    // Fix me: ugly workaround below
-    // Correct solution:
-    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
-    //    layout of output tensor should be set "manually" in Compute()
-    //    of each OPKernel. The reason layout should NOT be shared between
-    //    input and output "automatically" (now by InferShape()->ShareLoD())
-    //    is that layout transform may occur after InferShape().
-    // Workaround:
-    //    Skip set_layout() when input layout is kMKLDNN
-    //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
-    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
-    //    in Compute()
-    if (in_tensor.layout() != DataLayout::kMKLDNN)
+  // Fix me: ugly workaround below
+  // Correct solution:
+  //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+  //    layout of output tensor should be set "manually" in Compute()
+  //    of each OPKernel. The reason layout should NOT be shared between
+  //    input and output "automatically" (now by InferShape()->ShareLoD())
+  //    is that layout transform may occur after InferShape().
+  // Workaround:
+  //    Skip set_layout() when input layout is kMKLDNN
+  //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
+  //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
+  //    in Compute()
+  if (in_tensor.layout() != DataLayout::kMKLDNN)
 #endif
-      out_tensor->set_layout(in_tensor.layout());
-  }
-
-  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
-                   size_t j = 0) const {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-    if (!in_var->IsType<LoDTensor>()) return;
-    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-    auto in_tensor = in_var->Get<LoDTensor>();
-    auto* out_tensor = out_var->GetMutable<LoDTensor>();
     out_tensor->set_layout(in_tensor.layout());
-  }
-
-  bool IsRuntime() const override { return true; }
-
- protected:
-  DDim GetDim(const std::string& name) const override {
-    Variable* var = scope_.FindVar(name);
-    PADDLE_ENFORCE_NOT_NULL(var);
-    if (var->IsType<LoDTensor>()) {
-      return var->Get<LoDTensor>().dims();
-    } else if (var->IsType<SelectedRows>()) {
-      return var->Get<SelectedRows>().GetCompleteDims();
-    } else {
-      PADDLE_THROW(
-          "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
-          "type_id is %s.",
-          name, var->Type().name());
-    }
-  }
-
-  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
-    PADDLE_THROW("Only compile time support this method");
-  }
-
-  void SetDim(const std::string& name, const DDim& dim) override {
-    Variable* var = scope_.FindVar(name);
-    if (var->IsType<LoDTensor>()) {
-      var->GetMutable<LoDTensor>()->Resize(dim);
-    } else if (var->IsType<SelectedRows>()) {
-      var->GetMutable<SelectedRows>()->set_height(dim[0]);
-    } else {
-      PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
-                   name, var->Type().name());
-    }
-  }
-
-  void SetRepeatedDims(const std::string& name,
-                       const std::vector<DDim>& dims) override {
-    PADDLE_THROW("Only compile time support this method");
-  }
+}
 
-  proto::VarType::Type GetVarType(const std::string& name) const override {
-    auto* var = scope_.FindVar(name);
-    return ToVarType(var->Type());
+void RuntimeInferShapeContext::ShareLayout(const std::string& in,
+                                           const std::string& out, size_t i,
+                                           size_t j) const {
+  PADDLE_ENFORCE_LT(i, Inputs(in).size());
+  PADDLE_ENFORCE_LT(j, Outputs(out).size());
+  Variable* in_var = scope_.FindVar(Inputs(in)[i]);
+  Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+  if (!in_var->IsType<LoDTensor>()) return;
+  PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                 "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+  auto in_tensor = in_var->Get<LoDTensor>();
+  auto* out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->set_layout(in_tensor.layout());
+}
+
+DDim RuntimeInferShapeContext::GetDim(const std::string& name) const {
+  Variable* var = scope_.FindVar(name);
+  PADDLE_ENFORCE_NOT_NULL(var);
+  if (var->IsType<LoDTensor>()) {
+    return var->Get<LoDTensor>().dims();
+  } else if (var->IsType<SelectedRows>()) {
+    return var->Get<SelectedRows>().GetCompleteDims();
+  } else {
+    PADDLE_THROW(
+        "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
+        "type_id is %s.",
+        name, var->Type().name());
   }
+}
 
-  InferShapeVarPtr GetVarPtr(const std::string& name) override {
-    return scope_.FindVar(name);
+void RuntimeInferShapeContext::SetDim(const std::string& name,
+                                      const DDim& dim) {
+  Variable* var = scope_.FindVar(name);
+  if (var->IsType<LoDTensor>()) {
+    var->GetMutable<LoDTensor>()->Resize(dim);
+  } else if (var->IsType<SelectedRows>()) {
+    var->GetMutable<SelectedRows>()->set_height(dim[0]);
+  } else {
+    PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", name,
+                 var->Type().name());
   }
-
- private:
-  const OperatorBase& op_;
-  const Scope& scope_;
-};
+}
 
 static void CheckTensorNANOrInf(const std::string& name,
                                 const framework::Tensor& tensor) {
diff --git a/paddle/fluid/framework/shape_runtime_infer.h b/paddle/fluid/framework/shape_runtime_infer.h
new file mode 100644
index 0000000000..04d4e33f7a
--- /dev/null
+++ b/paddle/fluid/framework/shape_runtime_infer.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_type.h"
+
+namespace paddle {
+namespace framework {
+
+class RuntimeInferShapeContext : public InferShapeContext {
+ public:
+  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
+      : op_(op), scope_(scope) {}
+
+  bool HasInput(const std::string& name) const override;
+  bool HasOutput(const std::string& name) const override;
+  bool HasInputs(const std::string& name) const override;
+  bool HasOutputs(const std::string& name) const override;
+
+  const OperatorBase& OpBase() const { return op_; }
+
+  const Scope& InferScope() const { return scope_; }
+  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
+
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
+    return op_.Inputs(name);
+  }
+
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
+    return op_.Outputs(name);
+  }
+
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const override;
+
+  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
+                   size_t j = 0) const;
+
+  bool IsRuntime() const override { return true; }
+
+ protected:
+  DDim GetDim(const std::string& name) const override;
+  void SetDim(const std::string& name, const DDim& dim) override;
+
+  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
+    PADDLE_THROW("Only compile time support this method");
+  }
+  void SetRepeatedDims(const std::string& name,
+                       const std::vector<DDim>& dims) override {
+    PADDLE_THROW("Only compile time support this method");
+  }
+
+  proto::VarType::Type GetVarType(const std::string& name) const override {
+    auto* var = scope_.FindVar(name);
+    return ToVarType(var->Type());
+  }
+
+  InferShapeVarPtr GetVarPtr(const std::string& name) override {
+    return scope_.FindVar(name);
+  }
+
+ private:
+  const OperatorBase& op_;
+  const Scope& scope_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
index ef23ab3f98..ae9d5d78ae 100644
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fusion_lstm_op.h"
 #include <string>
+#include "paddle/fluid/framework/shape_runtime_infer.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -24,26 +25,54 @@ namespace paddle {
 namespace operators {
 
 void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
-                 "Input(WeightX) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
-                 "Input(WeightH) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                 "Input(Bias) of LSTM should not be null.");
-
-  PADDLE_ENFORCE(ctx->HasOutput("XX"),
-                 "Output(XX) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                 "Output(Hidden) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
-                 "Output(Cell) of LSTM should not be null.");
+  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
+  if (runtime_ctx == nullptr) {
+    LOG(FATAL) << "Should have runtime infer context";
+  }
+  const auto& ins = runtime_ctx->OpBase().Inputs();
+  const auto& outs = runtime_ctx->OpBase().Outputs();
+  const auto& scope = runtime_ctx->InferScope();
+  const auto ins_end = ins.end();
+  const auto outs_end = outs.end();
+  auto fair_input = [&](const std::string& name) -> bool {
+    auto it = ins.find(name);
+    if (it == ins_end) {
+      return false;
+    }
+    const auto& in = it->second;
+    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(in[0]) != nullptr;
+  };
+  auto fair_output = [&](const std::string& name) -> bool {
+    auto it = outs.find(name);
+    if (it == outs_end) {
+      return false;
+    }
+    const auto& out = it->second;
+    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(out[0]) != nullptr;
+  };
+
+  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of LSTM.");
+  PADDLE_ENFORCE(fair_input("WeightX"),
+                 "Assert only one Input(WeightX) of LSTM.");
+  PADDLE_ENFORCE(fair_input("WeightH"),
+                 "Assert only one Input(WeightH) of LSTM.");
+  PADDLE_ENFORCE(fair_input("Bias"), "Assert only one Input(Bias) of LSTM.");
+  PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of LSTM.");
+  PADDLE_ENFORCE(fair_output("Hidden"),
+                 "Assert only one Output(Hidden) of LSTM.");
+  PADDLE_ENFORCE(fair_output("Cell"), "Assert only one Output(Cell) of LSTM.");
 
   auto x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
 
-  if (ctx->HasInput("H0")) {
-    PADDLE_ENFORCE(ctx->HasInput("C0"),
+  if (fair_input("H0")) {
+    PADDLE_ENFORCE(fair_input("C0"),
                    "Input(Cell) and Input(Hidden) of LSTM should not "
                    "be null at the same time.");
     auto h_dims = ctx->GetInputDim("H0");
@@ -95,16 +124,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                   "Output(BatchedInput) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
-                   "Output(BatchedHidden) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
-                   "Output(BatchedCell) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                   "Output(ReorderedH0) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
-                   "Output(ReorderedC0) of LSTM should not be null.");
+    PADDLE_ENFORCE(fair_output("BatchedInput"),
+                   "Assert only one Output(BatchedInput) of LSTM.");
+    PADDLE_ENFORCE(fair_output("BatchedHidden"),
+                   "Assert only one Output(BatchedHidden) of LSTM.");
+    PADDLE_ENFORCE(fair_output("BatchedCell"),
+                   "Assert only one Output(BatchedCell) of LSTM.");
+    PADDLE_ENFORCE(fair_output("ReorderedH0"),
+                   "Assert only one Output(ReorderedH0) of LSTM");
+    PADDLE_ENFORCE(fair_output("ReorderedC0"),
+                   "Assert only one Output(ReorderedC0) of LSTM.");
     ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
     ctx->SetOutputDim("BatchedHidden", out_dims);
     ctx->SetOutputDim("BatchedCell", out_dims);

From a5556d44175931682bb049451639948c0da7ed6e Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 11 Sep 2018 17:49:54 +0800
Subject: [PATCH 04/17] refine attentionlstm infershape

---
 paddle/fluid/operators/attention_lstm_op.cc | 88 ++++++++++++++-------
 1 file changed, 60 insertions(+), 28 deletions(-)

diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 39b0c85699..ac4ddb5502 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
+#include "paddle/fluid/framework/shape_runtime_infer.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -23,29 +24,60 @@ namespace paddle {
 namespace operators {
 
 void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"),
-                 "Input(X) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("C0"),
-                 "Input(C0) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"),
-                 "Input(LSTMWeight) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("LSTMBias"),
-                 "Input(LSTMBias) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"),
-                 "Input(AttentionWeight) of AttentionLSTM should not be null.");
-
-  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                 "Output(Hidden) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
-                 "Output(Cell) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"),
-                 "Output(AttentionedX) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"),
-                 "Output(AttentionFCOut) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("LSTMX"),
-                 "Output(LSTMX) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"),
-                 "Output(LSTMOUT) of AttentionLSTM should not be null.");
+  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
+  if (runtime_ctx == nullptr) {
+    LOG(FATAL) << "Should have runtime infer context";
+  }
+  const auto& ins = runtime_ctx->OpBase().Inputs();
+  const auto& outs = runtime_ctx->OpBase().Outputs();
+  const auto& scope = runtime_ctx->InferScope();
+  const auto ins_end = ins.end();
+  const auto outs_end = outs.end();
+  auto fair_input = [&](const std::string& name) -> bool {
+    auto it = ins.find(name);
+    if (it == ins_end) {
+      return false;
+    }
+    const auto& in = it->second;
+    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(in[0]) != nullptr;
+  };
+  auto fair_output = [&](const std::string& name) -> bool {
+    auto it = outs.find(name);
+    if (it == outs_end) {
+      return false;
+    }
+    const auto& out = it->second;
+    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(out[0]) != nullptr;
+  };
+
+  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_input("C0"),
+                 "Assert only one Input(C0) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_input("LSTMWeight"),
+                 "Assert only one Input(LSTMWeight) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_input("LSTMBias"),
+                 "Assert only one Input(LSTMBias) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_input("AttentionWeight"),
+                 "Assert only one Input(AttentionWeight) of AttentionLSTM.");
+
+  PADDLE_ENFORCE(fair_output("Hidden"),
+                 "Assert only one Output(Hidden) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_output("Cell"),
+                 "Assert only one Output(Cell) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_output("AttentionedX"),
+                 "Assert only one Output(AttentionedX) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_output("AttentionFCOut"),
+                 "Assert only one Output(AttentionFCOut) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_output("LSTMX"),
+                 "Assert only one Output(LSTMX) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_output("LSTMOUT"),
+                 "Assert only one Output(LSTMOUT) of AttentionLSTM.");
 
   auto x_dims = ctx->GetInputDim("X");
   const int M = x_dims[1];
@@ -65,7 +97,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
   auto c_dims = ctx->GetInputDim("C0");
   PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2.");
   PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D);
-  if (ctx->HasInput("H0")) {
+  if (fair_input("H0")) {
     auto h_dims = ctx->GetInputDim("H0");
     PADDLE_ENFORCE(h_dims == c_dims,
                    "The dimension of Input(H0) and Input(C0) "
@@ -79,7 +111,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
   PADDLE_ENFORCE_EQ(atten_w_dims[1], 1,
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
-  if (ctx->HasInput("AttentionBias")) {
+  if (fair_input("AttentionBias")) {
     auto atten_b_dims = ctx->GetInputDim("AttentionBias");
     PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2,
                       "Input(AttentionBias)'s rank must be 2.");
@@ -89,7 +121,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                       "AttentionBias shapes must be 1 * 1.");
   }
 
-  if (ctx->HasInput("AttentionScalar")) {
+  if (fair_input("AttentionScalar")) {
     auto dims = ctx->GetInputDim("AttentionScalar");
     PADDLE_ENFORCE_EQ(dims.size(), 2,
                       "Input(AttentionScalar)'s rank must be 2.");
@@ -97,10 +129,10 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
     PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1.");
   }
 
-  if (ctx->HasInput("AttentionScalarBias")) {
+  if (fair_input("AttentionScalarBias")) {
     auto dims = ctx->GetInputDim("AttentionScalarBias");
     PADDLE_ENFORCE(
-        ctx->HasInput("AttentionScalar"),
+        fair_input("AttentionScalar"),
         "AttentionScalar should not be null when have AttentionScalarBias.");
     PADDLE_ENFORCE_EQ(dims.size(), 2,
                       "Input(AttentionScalarBias)'s rank must be 2.");

From 916f42bcbf7bc308f2135be5f341b8628cc883dc Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 11 Sep 2018 18:00:20 +0800
Subject: [PATCH 05/17] refine fusion gru infershape

---
 paddle/fluid/operators/fusion_gru_op.cc | 65 +++++++++++++++++++------
 1 file changed, 49 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index 916f84cb4a..bcdcb2ac4d 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fusion_gru_op.h"
 #include <cstring>  // for memcpy
 #include <string>
+#include "paddle/fluid/framework/shape_runtime_infer.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -25,14 +26,46 @@ namespace paddle {
 namespace operators {
 
 void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
-                 "Input(WeightX) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
-                 "Input(WeightH) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                 "Output(Hidden) of GRU should not be null.");
+  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
+  if (runtime_ctx == nullptr) {
+    LOG(FATAL) << "Should have runtime infer context";
+  }
+  const auto& ins = runtime_ctx->OpBase().Inputs();
+  const auto& outs = runtime_ctx->OpBase().Outputs();
+  const auto& scope = runtime_ctx->InferScope();
+  const auto ins_end = ins.end();
+  const auto outs_end = outs.end();
+  auto fair_input = [&](const std::string& name) -> bool {
+    auto it = ins.find(name);
+    if (it == ins_end) {
+      return false;
+    }
+    const auto& in = it->second;
+    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(in[0]) != nullptr;
+  };
+  auto fair_output = [&](const std::string& name) -> bool {
+    auto it = outs.find(name);
+    if (it == outs_end) {
+      return false;
+    }
+    const auto& out = it->second;
+    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(out[0]) != nullptr;
+  };
+
+  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of GRU.");
+  PADDLE_ENFORCE(fair_input("WeightX"),
+                 "Assert only one Input(WeightX) of GRU.");
+  PADDLE_ENFORCE(fair_input("WeightH"),
+                 "Assert only one Input(WeightH) of GRU.");
+  PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of GRU.");
+  PADDLE_ENFORCE(fair_output("Hidden"),
+                 "Assert only one Output(Hidden) of GRU.");
 
   auto x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
@@ -58,12 +91,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                     "should be 3 * %d.",
                     frame_size);
 
-  if (ctx->HasInput("H0")) {
+  if (fair_input("H0")) {
     auto h0_dims = ctx->GetInputDim("H0");
     PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
                       "The width of H0 must be equal to frame_size.");
   }
-  if (ctx->HasInput("Bias")) {
+  if (fair_input("Bias")) {
     auto b_dims = ctx->GetInputDim("Bias");
     PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
     PADDLE_ENFORCE_EQ(b_dims[0], 1,
@@ -79,12 +112,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                   "Output(ReorderedH0) of GRU should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                   "Output(BatchedInput) of GRU should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
-                   "Output(BatchedOut) of GRU should not be null.");
+    PADDLE_ENFORCE(fair_output("ReorderedH0"),
+                   "Assert only one Output(ReorderedH0) of GRU.");
+    PADDLE_ENFORCE(fair_output("BatchedInput"),
+                   "Assert only one Output(BatchedInput) of GRU.");
+    PADDLE_ENFORCE(fair_output("BatchedOut"),
+                   "Assert only one Output(BatchedOut) of GRU.");
     ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
     ctx->SetOutputDim("BatchedOut", out_dims);
   }

From 8e0fe035d478a8bfb7bea888b986eafa827dcbf1 Mon Sep 17 00:00:00 2001
From: superjomn <yanchunwei@outlook.com>
Date: Tue, 11 Sep 2018 10:16:19 +0000
Subject: [PATCH 06/17] fix ner_test when bs>1

---
 paddle/fluid/inference/tests/api/analyzer_ner_tester.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index 661b047ed7..6e8e43add7 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -144,8 +144,9 @@ void TestChineseNERPrediction(bool use_analysis) {
     size_t num_samples;
     for (int i = 0; i < FLAGS_repeat; i++) {
       DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+      // Just one batch, the num_samples remains the same.
       num_samples = data.num_samples;
-      for (size_t bid = 0; bid < num_samples; ++bid) {
+      for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) {
         PrepareInputs(&input_slots, &data, FLAGS_batch_size);
         timer.tic();
         predictor->Run(input_slots, &outputs);

From 8a1abe54d797de7c4f17ab92d2268c3cebf83b66 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 11 Sep 2018 18:30:49 +0800
Subject: [PATCH 07/17] clean fusion infershape code

---
 paddle/fluid/operators/attention_lstm_op.cc   | 35 +----------
 paddle/fluid/operators/fusion_gru_op.cc       | 35 +----------
 .../operators/fusion_infershape_define.h      | 60 +++++++++++++++++++
 paddle/fluid/operators/fusion_lstm_op.cc      | 35 +----------
 4 files changed, 66 insertions(+), 99 deletions(-)
 create mode 100644 paddle/fluid/operators/fusion_infershape_define.h

diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index ac4ddb5502..7531aa9a46 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
-#include "paddle/fluid/framework/shape_runtime_infer.h"
+#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -24,38 +24,7 @@ namespace paddle {
 namespace operators {
 
 void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
-  if (runtime_ctx == nullptr) {
-    LOG(FATAL) << "Should have runtime infer context";
-  }
-  const auto& ins = runtime_ctx->OpBase().Inputs();
-  const auto& outs = runtime_ctx->OpBase().Outputs();
-  const auto& scope = runtime_ctx->InferScope();
-  const auto ins_end = ins.end();
-  const auto outs_end = outs.end();
-  auto fair_input = [&](const std::string& name) -> bool {
-    auto it = ins.find(name);
-    if (it == ins_end) {
-      return false;
-    }
-    const auto& in = it->second;
-    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(in[0]) != nullptr;
-  };
-  auto fair_output = [&](const std::string& name) -> bool {
-    auto it = outs.find(name);
-    if (it == outs_end) {
-      return false;
-    }
-    const auto& out = it->second;
-    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(out[0]) != nullptr;
-  };
-
+  FUSION_INFERSHAPE_INIT;
   PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of AttentionLSTM.");
   PADDLE_ENFORCE(fair_input("C0"),
                  "Assert only one Input(C0) of AttentionLSTM.");
diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index bcdcb2ac4d..b10d311f05 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fusion_gru_op.h"
 #include <cstring>  // for memcpy
 #include <string>
-#include "paddle/fluid/framework/shape_runtime_infer.h"
+#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -26,38 +26,7 @@ namespace paddle {
 namespace operators {
 
 void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
-  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
-  if (runtime_ctx == nullptr) {
-    LOG(FATAL) << "Should have runtime infer context";
-  }
-  const auto& ins = runtime_ctx->OpBase().Inputs();
-  const auto& outs = runtime_ctx->OpBase().Outputs();
-  const auto& scope = runtime_ctx->InferScope();
-  const auto ins_end = ins.end();
-  const auto outs_end = outs.end();
-  auto fair_input = [&](const std::string& name) -> bool {
-    auto it = ins.find(name);
-    if (it == ins_end) {
-      return false;
-    }
-    const auto& in = it->second;
-    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(in[0]) != nullptr;
-  };
-  auto fair_output = [&](const std::string& name) -> bool {
-    auto it = outs.find(name);
-    if (it == outs_end) {
-      return false;
-    }
-    const auto& out = it->second;
-    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(out[0]) != nullptr;
-  };
-
+  FUSION_INFERSHAPE_INIT;
   PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of GRU.");
   PADDLE_ENFORCE(fair_input("WeightX"),
                  "Assert only one Input(WeightX) of GRU.");
diff --git a/paddle/fluid/operators/fusion_infershape_define.h b/paddle/fluid/operators/fusion_infershape_define.h
new file mode 100644
index 0000000000..89521672b0
--- /dev/null
+++ b/paddle/fluid/operators/fusion_infershape_define.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
+#define PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
+
+#include <string>
+#include "paddle/fluid/framework/shape_runtime_infer.h"
+
+namespace paddle {
+namespace operators {
+
+#define FUSION_INFERSHAPE_INIT                                                 \
+  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx); \
+  if (runtime_ctx == nullptr) {                                                \
+    LOG(FATAL) << "Should have runtime infer context";                         \
+  }                                                                            \
+  const auto& ins = runtime_ctx->OpBase().Inputs();                            \
+  const auto& outs = runtime_ctx->OpBase().Outputs();                          \
+  const auto& scope = runtime_ctx->InferScope();                               \
+  const auto ins_end = ins.end();                                              \
+  const auto outs_end = outs.end();                                            \
+  auto fair_input = [&](const std::string& name) -> bool {                     \
+    auto it = ins.find(name);                                                  \
+    if (it == ins_end) {                                                       \
+      return false;                                                            \
+    }                                                                          \
+    const auto& in = it->second;                                               \
+    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {                 \
+      return false;                                                            \
+    }                                                                          \
+    return scope.FindVar(in[0]) != nullptr;                                    \
+  };                                                                           \
+  auto fair_output = [&](const std::string& name) -> bool {                    \
+    auto it = outs.find(name);                                                 \
+    if (it == outs_end) {                                                      \
+      return false;                                                            \
+    }                                                                          \
+    const auto& out = it->second;                                              \
+    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {               \
+      return false;                                                            \
+    }                                                                          \
+    return scope.FindVar(out[0]) != nullptr;                                   \
+  }
+
+}  // namespace operators
+}  // namespace paddle
+
+#endif  // PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
index ae9d5d78ae..08af98f850 100644
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fusion_lstm_op.h"
 #include <string>
-#include "paddle/fluid/framework/shape_runtime_infer.h"
+#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -25,38 +25,7 @@ namespace paddle {
 namespace operators {
 
 void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
-  if (runtime_ctx == nullptr) {
-    LOG(FATAL) << "Should have runtime infer context";
-  }
-  const auto& ins = runtime_ctx->OpBase().Inputs();
-  const auto& outs = runtime_ctx->OpBase().Outputs();
-  const auto& scope = runtime_ctx->InferScope();
-  const auto ins_end = ins.end();
-  const auto outs_end = outs.end();
-  auto fair_input = [&](const std::string& name) -> bool {
-    auto it = ins.find(name);
-    if (it == ins_end) {
-      return false;
-    }
-    const auto& in = it->second;
-    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(in[0]) != nullptr;
-  };
-  auto fair_output = [&](const std::string& name) -> bool {
-    auto it = outs.find(name);
-    if (it == outs_end) {
-      return false;
-    }
-    const auto& out = it->second;
-    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(out[0]) != nullptr;
-  };
-
+  FUSION_INFERSHAPE_INIT;
   PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of LSTM.");
   PADDLE_ENFORCE(fair_input("WeightX"),
                  "Assert only one Input(WeightX) of LSTM.");

From 7dd54afd0c7f328891fbb0df15e434aa9afba216 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Tue, 11 Sep 2018 12:12:46 +0000
Subject: [PATCH 08/17] fix program desc unit test error

---
 paddle/fluid/framework/program_desc_test.cc | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
index 925ea98dbe..7e689a37da 100644
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -87,8 +87,17 @@ TEST(ProgramDesc, copy_ctor) {
     ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
     ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs());
 
-    ASSERT_EQ(op_copy->Proto()->SerializeAsString(),
-              op_origin->Proto()->SerializeAsString());
+    ASSERT_EQ(op_origin->Proto()->attrs().size(),
+              op_copy->Proto()->attrs().size());
+    for (auto it = op_origin->Proto()->attrs().begin();
+         it != op_origin->Proto()->attrs().end(); ++it) {
+      for (auto it_2 = op_copy->Proto()->attrs().begin();
+           it_2 != op_copy->Proto()->attrs().end(); ++it_2) {
+        if (it->name() == it_2->name()) {
+          ASSERT_TRUE(it_2->SerializeAsString() == it->SerializeAsString());
+        }
+      }
+    }
 
     if (op->Type() == "op_with_subblock") {
       ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));

From 8bb824bb93629fbf69d7e93ffc0dca85e726300c Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 12 Sep 2018 00:06:58 +0800
Subject: [PATCH 09/17] refine infershape hasinput and hasoutput

---
 paddle/fluid/framework/operator.cc            | 274 ++++++++++--------
 paddle/fluid/framework/shape_runtime_infer.h  |  86 ------
 paddle/fluid/operators/attention_lstm_op.cc   |  35 ++-
 paddle/fluid/operators/fusion_gru_op.cc       |  22 +-
 .../operators/fusion_infershape_define.h      |  60 ----
 paddle/fluid/operators/fusion_lstm_op.cc      |  31 +-
 6 files changed, 197 insertions(+), 311 deletions(-)
 delete mode 100644 paddle/fluid/framework/shape_runtime_infer.h
 delete mode 100644 paddle/fluid/operators/fusion_infershape_define.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 36025db7ba..bbd141cb3b 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
-#include "paddle/fluid/framework/shape_runtime_infer.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -459,147 +458,184 @@ bool OpSupportGPU(const std::string& op_type) {
   return false;
 }
 
-bool RuntimeInferShapeContext::HasInput(const std::string& name) const {
-  if (!op_.HasInputs(name)) {
-    return false;
-  }
-  auto& ins = Inputs(name);
-  size_t length = ins.size();
-  if (length == 0) {
-    return false;
-  }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input %s should not have more than one inputs", name);
-  auto ipt = ins[0];
-  auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-  return var != nullptr;
-}
+class RuntimeInferShapeContext : public InferShapeContext {
+ public:
+  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
+      : op_(op), scope_(scope) {}
 
-bool RuntimeInferShapeContext::HasOutput(const std::string& name) const {
-  if (!op_.HasOutputs(name)) {
-    return false;
-  }
-  auto& outs = Outputs(name);
-  size_t length = outs.size();
-  if (length == 0) {
-    return false;
-  }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Output %s should not have more than one inputs", name);
-  auto ipt = outs[0];
-  auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-  return var != nullptr;
-}
+  bool HasInput(const std::string& name) const override {
+    // has only one input
+    const auto& ins = op_.Inputs();
+    auto it = ins.find(name);
+    if (it == ins.end()) {
+      return false;
+    }
+    const auto& in = it->second;
 
-bool RuntimeInferShapeContext::HasInputs(const std::string& name) const {
-  if (!op_.HasInputs(name)) {
-    return false;
-  }
-  auto inputs = op_.Inputs(name);
-  if (inputs.empty()) {
-    return false;
-  }
-  for (auto& input : inputs) {
-    if (scope_.FindVar(input) == nullptr) {
+    if (in.size() != 1 || in[0] == kEmptyVarName) {
       return false;
     }
+    return scope_.FindVar(in[0]) != nullptr;
   }
-  return true;
-}
 
-bool RuntimeInferShapeContext::HasOutputs(const std::string& name) const {
-  if (!op_.HasOutputs(name)) {
-    return false;
+  bool HasOutput(const std::string& name) const override {
+    // has only one output
+    const auto& outs = op_.Outputs();
+    auto it = outs.find(name);
+    if (it == outs.end()) {
+      return false;
+    }
+    const auto& out = it->second;
+    if (out.size() != 1 || out[0] == kEmptyVarName) {
+      return false;
+    }
+    return scope_.FindVar(out[0]) != nullptr;
   }
-  auto outputs = op_.Outputs(name);
-  if (outputs.empty()) {
-    return false;
+
+  // True iff op_.HasInputs(name) holds, the slot lists at least one variable
+  // name, and scope_.FindVar finds every one of them.
+  bool HasInputs(const std::string& name) const override {
+    if (!op_.HasInputs(name)) return false;
+    // Bind by reference: the names are only read here, so copying the whole
+    // vector of strings on every call is unnecessary.
+    const auto& inputs = op_.Inputs(name);
+    if (inputs.empty()) return false;
+    for (const auto& input : inputs) {
+      if (scope_.FindVar(input) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
-  for (auto& output : outputs) {
-    if (scope_.FindVar(output) == nullptr) {
+
+  // True iff op_.HasOutputs(name) holds, the slot lists at least one variable
+  // name, and scope_.FindVar finds every one of them.
+  bool HasOutputs(const std::string& name) const override {
+    if (!op_.HasOutputs(name)) return false;
+    const auto& outputs = op_.Outputs(name);  // read-only: no vector copy
+    if (outputs.empty()) {
       return false;
     }
+    for (const auto& output : outputs) {
+      if (scope_.FindVar(output) == nullptr) {
+        return false;
+      }
+    }
+    return true;
   }
-  return true;
-}
 
-void RuntimeInferShapeContext::ShareLoD(const std::string& in,
-                                        const std::string& out, size_t i,
-                                        size_t j) const {
-  PADDLE_ENFORCE_LT(i, Inputs(in).size());
-  PADDLE_ENFORCE_LT(j, Outputs(out).size());
-  Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-  Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-  if (!in_var->IsType<LoDTensor>()) return;
-  PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                 "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-  auto in_tensor = in_var->Get<LoDTensor>();
-  auto* out_tensor = out_var->GetMutable<LoDTensor>();
-  out_tensor->set_lod(in_tensor.lod());
+  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
+
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
+    return op_.Inputs(name);
+  }
+
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
+    return op_.Outputs(name);
+  }
+
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const override {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    Variable* in_var = scope_.FindVar(Inputs(in)[i]);  // NOTE: may be null
+    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+    if (!in_var->IsType<LoDTensor>()) return;
+    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+    const auto& in_tensor = in_var->Get<LoDTensor>();  // borrow, don't copy
+    auto* out_tensor = out_var->GetMutable<LoDTensor>();
+    out_tensor->set_lod(in_tensor.lod());
 
 // TODO(dzhwinter) : reuse ShareLoD in most operators.
 // Need to call ShareLayout explicitly in sequence related ops.
 // Shall we have a better method to shared info between in/out Tensor?
 #ifdef PADDLE_WITH_MKLDNN
-  // Fix me: ugly workaround below
-  // Correct solution:
-  //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
-  //    layout of output tensor should be set "manually" in Compute()
-  //    of each OPKernel. The reason layout should NOT be shared between
-  //    input and output "automatically" (now by InferShape()->ShareLoD())
-  //    is that layout transform may occur after InferShape().
-  // Workaround:
-  //    Skip set_layout() when input layout is kMKLDNN
-  //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
-  //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
-  //    in Compute()
-  if (in_tensor.layout() != DataLayout::kMKLDNN)
+    // Fix me: ugly workaround below
+    // Correct solution:
+    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+    //    layout of output tensor should be set "manually" in Compute()
+    //    of each OPKernel. The reason layout should NOT be shared between
+    //    input and output "automatically" (now by InferShape()->ShareLoD())
+    //    is that layout transform may occur after InferShape().
+    // Workaround:
+    //    Skip set_layout() when input layout is kMKLDNN
+    //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
+    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
+    //    in Compute()
+    if (in_tensor.layout() != DataLayout::kMKLDNN)
 #endif
+      out_tensor->set_layout(in_tensor.layout());
+  }
+
+  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
+                   size_t j = 0) const {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    Variable* in_var = scope_.FindVar(Inputs(in)[i]);  // NOTE: may be null
+    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+    if (!in_var->IsType<LoDTensor>()) return;
+    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+    const auto& in_tensor = in_var->Get<LoDTensor>();  // borrow, don't copy
+    auto* out_tensor = out_var->GetMutable<LoDTensor>();
     out_tensor->set_layout(in_tensor.layout());
-}
+  }
 
-void RuntimeInferShapeContext::ShareLayout(const std::string& in,
-                                           const std::string& out, size_t i,
-                                           size_t j) const {
-  PADDLE_ENFORCE_LT(i, Inputs(in).size());
-  PADDLE_ENFORCE_LT(j, Outputs(out).size());
-  Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-  Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-  if (!in_var->IsType<LoDTensor>()) return;
-  PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                 "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-  auto in_tensor = in_var->Get<LoDTensor>();
-  auto* out_tensor = out_var->GetMutable<LoDTensor>();
-  out_tensor->set_layout(in_tensor.layout());
-}
-
-DDim RuntimeInferShapeContext::GetDim(const std::string& name) const {
-  Variable* var = scope_.FindVar(name);
-  PADDLE_ENFORCE_NOT_NULL(var);
-  if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>().dims();
-  } else if (var->IsType<SelectedRows>()) {
-    return var->Get<SelectedRows>().GetCompleteDims();
-  } else {
-    PADDLE_THROW(
-        "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
-        "type_id is %s.",
-        name, var->Type().name());
+  bool IsRuntime() const override { return true; }
+
+ protected:
+  DDim GetDim(const std::string& name) const override {
+    Variable* var = scope_.FindVar(name);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    if (var->IsType<LoDTensor>()) {
+      return var->Get<LoDTensor>().dims();
+    } else if (var->IsType<SelectedRows>()) {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    } else {
+      PADDLE_THROW(
+          "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
+    }
   }
-}
 
-void RuntimeInferShapeContext::SetDim(const std::string& name,
-                                      const DDim& dim) {
-  Variable* var = scope_.FindVar(name);
-  if (var->IsType<LoDTensor>()) {
-    var->GetMutable<LoDTensor>()->Resize(dim);
-  } else if (var->IsType<SelectedRows>()) {
-    var->GetMutable<SelectedRows>()->set_height(dim[0]);
-  } else {
-    PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", name,
-                 var->Type().name());
+  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
+    PADDLE_THROW("Only compile time support this method");
   }
-}
+
+  void SetDim(const std::string& name, const DDim& dim) override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<LoDTensor>()) {
+      var->GetMutable<LoDTensor>()->Resize(dim);
+    } else if (var->IsType<SelectedRows>()) {
+      var->GetMutable<SelectedRows>()->set_height(dim[0]);
+    } else {
+      PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
+                   name, var->Type().name());
+    }
+  }
+
+  void SetRepeatedDims(const std::string& name,
+                       const std::vector<DDim>& dims) override {
+    PADDLE_THROW("Only compile time support this method");
+  }
+
+  proto::VarType::Type GetVarType(const std::string& name) const override {
+    auto* var = scope_.FindVar(name);
+    return ToVarType(var->Type());
+  }
+
+  InferShapeVarPtr GetVarPtr(const std::string& name) override {
+    return scope_.FindVar(name);
+  }
+
+ private:
+  const OperatorBase& op_;
+  const Scope& scope_;
+};
 
 static void CheckTensorNANOrInf(const std::string& name,
                                 const framework::Tensor& tensor) {
diff --git a/paddle/fluid/framework/shape_runtime_infer.h b/paddle/fluid/framework/shape_runtime_infer.h
deleted file mode 100644
index 04d4e33f7a..0000000000
--- a/paddle/fluid/framework/shape_runtime_infer.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/shape_inference.h"
-#include "paddle/fluid/framework/var_type.h"
-
-namespace paddle {
-namespace framework {
-
-class RuntimeInferShapeContext : public InferShapeContext {
- public:
-  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
-      : op_(op), scope_(scope) {}
-
-  bool HasInput(const std::string& name) const override;
-  bool HasOutput(const std::string& name) const override;
-  bool HasInputs(const std::string& name) const override;
-  bool HasOutputs(const std::string& name) const override;
-
-  const OperatorBase& OpBase() const { return op_; }
-
-  const Scope& InferScope() const { return scope_; }
-  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
-
-  const std::vector<std::string>& Inputs(
-      const std::string& name) const override {
-    return op_.Inputs(name);
-  }
-
-  const std::vector<std::string>& Outputs(
-      const std::string& name) const override {
-    return op_.Outputs(name);
-  }
-
-  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
-                size_t j = 0) const override;
-
-  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
-                   size_t j = 0) const;
-
-  bool IsRuntime() const override { return true; }
-
- protected:
-  DDim GetDim(const std::string& name) const override;
-  void SetDim(const std::string& name, const DDim& dim) override;
-
-  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
-    PADDLE_THROW("Only compile time support this method");
-  }
-  void SetRepeatedDims(const std::string& name,
-                       const std::vector<DDim>& dims) override {
-    PADDLE_THROW("Only compile time support this method");
-  }
-
-  proto::VarType::Type GetVarType(const std::string& name) const override {
-    auto* var = scope_.FindVar(name);
-    return ToVarType(var->Type());
-  }
-
-  InferShapeVarPtr GetVarPtr(const std::string& name) override {
-    return scope_.FindVar(name);
-  }
-
- private:
-  const OperatorBase& op_;
-  const Scope& scope_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 7531aa9a46..9b943440a8 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
-#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -24,28 +23,28 @@ namespace paddle {
 namespace operators {
 
 void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  FUSION_INFERSHAPE_INIT;
-  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_input("C0"),
+  PADDLE_ENFORCE(ctx->HasInput("X"),
+                 "Assert only one Input(X) of AttentionLSTM.");
+  PADDLE_ENFORCE(ctx->HasInput("C0"),
                  "Assert only one Input(C0) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_input("LSTMWeight"),
+  PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"),
                  "Assert only one Input(LSTMWeight) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_input("LSTMBias"),
+  PADDLE_ENFORCE(ctx->HasInput("LSTMBias"),
                  "Assert only one Input(LSTMBias) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_input("AttentionWeight"),
+  PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"),
                  "Assert only one Input(AttentionWeight) of AttentionLSTM.");
 
-  PADDLE_ENFORCE(fair_output("Hidden"),
+  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                  "Assert only one Output(Hidden) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_output("Cell"),
+  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
                  "Assert only one Output(Cell) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_output("AttentionedX"),
+  PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"),
                  "Assert only one Output(AttentionedX) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_output("AttentionFCOut"),
+  PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"),
                  "Assert only one Output(AttentionFCOut) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_output("LSTMX"),
+  PADDLE_ENFORCE(ctx->HasOutput("LSTMX"),
                  "Assert only one Output(LSTMX) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_output("LSTMOUT"),
+  PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"),
                  "Assert only one Output(LSTMOUT) of AttentionLSTM.");
 
   auto x_dims = ctx->GetInputDim("X");
@@ -66,7 +65,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
   auto c_dims = ctx->GetInputDim("C0");
   PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2.");
   PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D);
-  if (fair_input("H0")) {
+  if (ctx->HasInput("H0")) {
     auto h_dims = ctx->GetInputDim("H0");
     PADDLE_ENFORCE(h_dims == c_dims,
                    "The dimension of Input(H0) and Input(C0) "
@@ -80,7 +79,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
   PADDLE_ENFORCE_EQ(atten_w_dims[1], 1,
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
-  if (fair_input("AttentionBias")) {
+  if (ctx->HasInput("AttentionBias")) {
     auto atten_b_dims = ctx->GetInputDim("AttentionBias");
     PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2,
                       "Input(AttentionBias)'s rank must be 2.");
@@ -90,7 +89,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                       "AttentionBias shapes must be 1 * 1.");
   }
 
-  if (fair_input("AttentionScalar")) {
+  if (ctx->HasInput("AttentionScalar")) {
     auto dims = ctx->GetInputDim("AttentionScalar");
     PADDLE_ENFORCE_EQ(dims.size(), 2,
                       "Input(AttentionScalar)'s rank must be 2.");
@@ -98,10 +97,10 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
     PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1.");
   }
 
-  if (fair_input("AttentionScalarBias")) {
+  if (ctx->HasInput("AttentionScalarBias")) {
     auto dims = ctx->GetInputDim("AttentionScalarBias");
     PADDLE_ENFORCE(
-        fair_input("AttentionScalar"),
+        ctx->HasInput("AttentionScalar"),
         "AttentionScalar should not be null when have AttentionScalarBias.");
     PADDLE_ENFORCE_EQ(dims.size(), 2,
                       "Input(AttentionScalarBias)'s rank must be 2.");
diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index b10d311f05..31e87d9113 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/fusion_gru_op.h"
 #include <cstring>  // for memcpy
 #include <string>
-#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -26,14 +25,13 @@ namespace paddle {
 namespace operators {
 
 void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
-  FUSION_INFERSHAPE_INIT;
-  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of GRU.");
-  PADDLE_ENFORCE(fair_input("WeightX"),
+  PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of GRU.");
+  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
                  "Assert only one Input(WeightX) of GRU.");
-  PADDLE_ENFORCE(fair_input("WeightH"),
+  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
                  "Assert only one Input(WeightH) of GRU.");
-  PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of GRU.");
-  PADDLE_ENFORCE(fair_output("Hidden"),
+  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of GRU.");
+  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                  "Assert only one Output(Hidden) of GRU.");
 
   auto x_dims = ctx->GetInputDim("X");
@@ -60,12 +58,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                     "should be 3 * %d.",
                     frame_size);
 
-  if (fair_input("H0")) {
+  if (ctx->HasInput("H0")) {
     auto h0_dims = ctx->GetInputDim("H0");
     PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
                       "The width of H0 must be equal to frame_size.");
   }
-  if (fair_input("Bias")) {
+  if (ctx->HasInput("Bias")) {
     auto b_dims = ctx->GetInputDim("Bias");
     PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
     PADDLE_ENFORCE_EQ(b_dims[0], 1,
@@ -81,11 +79,11 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
-    PADDLE_ENFORCE(fair_output("ReorderedH0"),
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
                    "Assert only one Output(ReorderedH0) of GRU.");
-    PADDLE_ENFORCE(fair_output("BatchedInput"),
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
                    "Assert only one Output(BatchedInput) of GRU.");
-    PADDLE_ENFORCE(fair_output("BatchedOut"),
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
                    "Assert only one Output(BatchedOut) of GRU.");
     ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
     ctx->SetOutputDim("BatchedOut", out_dims);
diff --git a/paddle/fluid/operators/fusion_infershape_define.h b/paddle/fluid/operators/fusion_infershape_define.h
deleted file mode 100644
index 89521672b0..0000000000
--- a/paddle/fluid/operators/fusion_infershape_define.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
-#define PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
-
-#include <string>
-#include "paddle/fluid/framework/shape_runtime_infer.h"
-
-namespace paddle {
-namespace operators {
-
-#define FUSION_INFERSHAPE_INIT                                                 \
-  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx); \
-  if (runtime_ctx == nullptr) {                                                \
-    LOG(FATAL) << "Should have runtime infer context";                         \
-  }                                                                            \
-  const auto& ins = runtime_ctx->OpBase().Inputs();                            \
-  const auto& outs = runtime_ctx->OpBase().Outputs();                          \
-  const auto& scope = runtime_ctx->InferScope();                               \
-  const auto ins_end = ins.end();                                              \
-  const auto outs_end = outs.end();                                            \
-  auto fair_input = [&](const std::string& name) -> bool {                     \
-    auto it = ins.find(name);                                                  \
-    if (it == ins_end) {                                                       \
-      return false;                                                            \
-    }                                                                          \
-    const auto& in = it->second;                                               \
-    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {                 \
-      return false;                                                            \
-    }                                                                          \
-    return scope.FindVar(in[0]) != nullptr;                                    \
-  };                                                                           \
-  auto fair_output = [&](const std::string& name) -> bool {                    \
-    auto it = outs.find(name);                                                 \
-    if (it == outs_end) {                                                      \
-      return false;                                                            \
-    }                                                                          \
-    const auto& out = it->second;                                              \
-    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {               \
-      return false;                                                            \
-    }                                                                          \
-    return scope.FindVar(out[0]) != nullptr;                                   \
-  }
-
-}  // namespace operators
-}  // namespace paddle
-
-#endif  // PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
index 08af98f850..55e465e3af 100644
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fusion_lstm_op.h"
 #include <string>
-#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -25,23 +24,23 @@ namespace paddle {
 namespace operators {
 
 void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  FUSION_INFERSHAPE_INIT;
-  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of LSTM.");
-  PADDLE_ENFORCE(fair_input("WeightX"),
+  PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
                  "Assert only one Input(WeightX) of LSTM.");
-  PADDLE_ENFORCE(fair_input("WeightH"),
+  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
                  "Assert only one Input(WeightH) of LSTM.");
-  PADDLE_ENFORCE(fair_input("Bias"), "Assert only one Input(Bias) of LSTM.");
-  PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of LSTM.");
-  PADDLE_ENFORCE(fair_output("Hidden"),
+  PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                  "Assert only one Output(Hidden) of LSTM.");
-  PADDLE_ENFORCE(fair_output("Cell"), "Assert only one Output(Cell) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+                 "Assert only one Output(Cell) of LSTM.");
 
   auto x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
 
-  if (fair_input("H0")) {
-    PADDLE_ENFORCE(fair_input("C0"),
+  if (ctx->HasInput("H0")) {
+    PADDLE_ENFORCE(ctx->HasInput("C0"),
                    "Input(Cell) and Input(Hidden) of LSTM should not "
                    "be null at the same time.");
     auto h_dims = ctx->GetInputDim("H0");
@@ -93,15 +92,15 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
-    PADDLE_ENFORCE(fair_output("BatchedInput"),
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
                    "Assert only one Output(BatchedInput) of LSTM.");
-    PADDLE_ENFORCE(fair_output("BatchedHidden"),
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
                    "Assert only one Output(BatchedHidden) of LSTM.");
-    PADDLE_ENFORCE(fair_output("BatchedCell"),
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
                    "Assert only one Output(BatchedCell) of LSTM.");
-    PADDLE_ENFORCE(fair_output("ReorderedH0"),
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
                    "Assert only one Output(ReorderedH0) of LSTM");
-    PADDLE_ENFORCE(fair_output("ReorderedC0"),
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
                    "Assert only one Output(ReorderedC0) of LSTM.");
     ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
     ctx->SetOutputDim("BatchedHidden", out_dims);

From 312e92ab072297dae3bf2baf6479b51bfc9b88e6 Mon Sep 17 00:00:00 2001
From: Shan Yi <35982308+shanyi15@users.noreply.github.com>
Date: Wed, 12 Sep 2018 10:43:32 +0800
Subject: [PATCH 10/17] update-readme

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 60ffbe7281..45186ec4ef 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
 
-### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
+### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
 
 ## Installation
 
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website.
 
 ## Documentation
 
-We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
 
    We appreciate your contributions!
 

From 36d6e44681c3ebe1ff3992b37a981ca468580080 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 12 Sep 2018 03:41:39 +0000
Subject: [PATCH 11/17] fix test_py_reader_using_executor error

---
 .../fluid/tests/unittests/test_py_reader_using_executor.py     | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index 931cac409f..0fb9518a45 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -96,7 +96,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
         self.queue_capacity = 50
 
     def test(self):
-        for use_cuda in [False, True]:
+        for use_cuda in ([False, True]
+                         if core.core.is_compiled_with_cuda() else [False]):
             for use_parallel_executor in [False, True]:
                 for use_double_buffer in [False, True]:
                     print('Test Parameters:'),

From d61c11764af1249c8acc6937f2c25a8ae6c86c3e Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 12 Sep 2018 12:50:50 +0800
Subject: [PATCH 12/17] follow comment add enforce

---
 paddle/fluid/framework/operator.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index bbd141cb3b..b7fae7171a 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -471,10 +471,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
       return false;
     }
     const auto& in = it->second;
-
-    if (in.size() != 1 || in[0] == kEmptyVarName) {
+    if (in.size() == 0 || in[0] == kEmptyVarName) {
       return false;
     }
+    PADDLE_ENFORCE_EQ(in.size(), 1UL,
+                      "Input %s should not have more than one inputs", name);
     return scope_.FindVar(in[0]) != nullptr;
   }
 
@@ -486,9 +487,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
       return false;
     }
     const auto& out = it->second;
-    if (out.size() != 1 || out[0] == kEmptyVarName) {
+    if (out.size() == 0 || out[0] == kEmptyVarName) {
       return false;
     }
+    PADDLE_ENFORCE_EQ(out.size(), 1UL,
+                      "Output %s should not have more than one outputs", name);
     return scope_.FindVar(out[0]) != nullptr;
   }
 

From d41176411fd4f5f06155c7c73264f1145ecccee7 Mon Sep 17 00:00:00 2001
From: Jiabin Yang <marsyang199376@gmail.com>
Date: Wed, 12 Sep 2018 13:08:02 +0800
Subject: [PATCH 13/17] Update test_py_reader_using_executor.py

---
 .../fluid/tests/unittests/test_py_reader_using_executor.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index 0fb9518a45..b7fad9b3a6 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -97,7 +97,7 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
 
     def test(self):
         for use_cuda in ([False, True]
-                         if core.core.is_compiled_with_cuda() else [False]):
+                         if core.is_compiled_with_cuda() else [False]):
             for use_parallel_executor in [False, True]:
                 for use_double_buffer in [False, True]:
                     print('Test Parameters:'),

From bdd957b4be7a023426f76ae6e3153aa5a0e1686f Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 12 Sep 2018 05:18:22 +0000
Subject: [PATCH 14/17] fix test_parallel_executor_transformer

---
 .../tests/unittests/test_parallel_executor_transformer.py     | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index 5ad922725a..a55b2002ed 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -20,6 +20,7 @@ import numpy as np
 from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
 import paddle
+import paddle.fluid.core as core
 import paddle.dataset.wmt16 as wmt16
 import os
 
@@ -170,7 +171,8 @@ class TestTransformer(TestParallelExecutorBase):
                 writer.complete_append_tensor()
 
     def test_main(self):
-        self.check_network_convergence(transformer, use_cuda=True)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(transformer, use_cuda=True)
         self.check_network_convergence(transformer, use_cuda=False, iter=5)
 
 

From 41de582bb092dfa67bd2a1fa5d3b469db1ae81e2 Mon Sep 17 00:00:00 2001
From: Sylwester Fraczek <sylwester.fraczek@intel.com>
Date: Wed, 12 Sep 2018 10:22:11 +0200
Subject: [PATCH 15/17] create conv relu pass for MKLDNN (#13258)

---
 paddle/fluid/framework/ir/CMakeLists.txt      |   6 +
 .../ir/conv_relu_mkldnn_fuse_pass.cc          |  90 +++++++++++++++
 .../framework/ir/conv_relu_mkldnn_fuse_pass.h |  39 +++++++
 .../ir/conv_relu_mkldnn_fuse_pass_tester.cc   | 108 ++++++++++++++++++
 .../framework/ir/graph_pattern_detector.cc    |  33 ++++++
 .../framework/ir/graph_pattern_detector.h     |  22 ++++
 6 files changed, 298 insertions(+)
 create mode 100644 paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
 create mode 100644 paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
 create mode 100644 paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index ce3ebed00b..7004f484a9 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -28,6 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
+if(WITH_MKLDNN)
+  pass_library(conv_relu_mkldnn_fuse_pass inference)
+endif()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
@@ -42,3 +45,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
+if(WITH_MKLDNN)
+  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+endif()
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
new file mode 100644
index 0000000000..4408cb45ac
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Rewrites each subgraph matched by patterns::ConvReLU into one conv2d op with
+// the attribute fuse_relu = true, and returns the same (mutated) graph.
+std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());
+
+  GraphPatternDetector gpd;
+  auto* conv_input = gpd.mutable_pattern()
+                         ->NewNode("conv_relu_mkldnn_fuse/conv_input")
+                         ->AsInput()
+                         ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(),
+                                       "conv_relu_mkldnn_fuse");
+  conv_relu_pattern(conv_input);
+
+  int found_conv_relu_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvReLU fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
+                              conv_relu_pattern);  // Filter
+    GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern);  // Bias
+    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);    // tmp
+    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern);  // CONV op
+    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);  // Out
+    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);  // ReLU op
+
+    // conv_input must be matched before subgraph.at(conv_input) is used below.
+    PADDLE_ENFORCE(subgraph.count(conv_input));
+    OpDesc desc;  // Describes the fused op: conv2d with fuse_relu = true.
+    std::string conv_relu_i_in = subgraph.at(conv_input)->Name();
+    std::string conv_relu_w_in = conv_weight->Name();
+    std::string conv_relu_b_in = conv_bias->Name();
+    std::string conv_relu_out = relu_out->Name();
+    desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
+    desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
+    desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
+    desc.SetOutput("Out", std::vector<std::string>({conv_relu_out}));
+    desc.SetType("conv2d");
+    for (auto& attr : conv->Op()->GetAttrMap()) {
+      desc.SetAttr(attr.first, attr.second);
+    }
+    desc.SetAttr("fuse_relu", true);
+    auto conv_relu_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
+    GraphSafeRemoveNodes(g, {conv, relu, conv_out});
+
+    IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node);
+    IR_NODE_LINK_TO(conv_weight, conv_relu_node);
+    IR_NODE_LINK_TO(conv_bias, conv_relu_node);
+    IR_NODE_LINK_TO(conv_relu_node, relu_out);
+
+    found_conv_relu_count++;
+  };
+
+  gpd(graph.get(), handler);
+
+  AddStatis(found_conv_relu_count);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvReLUFusePass);
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
new file mode 100644
index 0000000000..b5de0d5487
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse CONV + ReLU into a single conv2d op with fuse_relu set to true.
+ */
+class ConvReLUFusePass : public FusePassBase {
+ public:
+  virtual ~ConvReLUFusePass() = default;
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
new file mode 100644
index 0000000000..82b5fa1886
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op_desc = prog->MutableBlock(0)->AppendOp();  // new op in block 0
+  op_desc->SetType(type);
+  if (type == "relu") {
+    op_desc->SetInput("X", inputs);
+  } else if (type == "conv2d") {  // expects inputs = {Input, Filter, Bias}
+    op_desc->SetAttr("use_mkldnn", true);
+    op_desc->SetInput("Input", {inputs[0]});
+    op_desc->SetInput("Filter", {inputs[1]});
+    op_desc->SetInput("Bias", {inputs[2]});
+  }  // any other op type gets no inputs wired, only its outputs
+  op_desc->SetOutput("Out", outputs);
+}
+
+// Builds the program under test:
+//   a -> OP0 -> b -> OP1 -> c
+//   (c, weights, bias) -> conv2d -> f
+//   f -> relu -> g
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc program;
+  // Declare every variable used by the ops below in block 0.
+  const std::vector<std::string> var_names(
+      {"a", "b", "c", "weights", "bias", "f", "g"});
+  for (const auto& name : var_names) {
+    auto* var_desc = program.MutableBlock(0)->Var(name);
+    var_desc->SetType(proto::VarType::SELECTED_ROWS);
+    // Only the conv2d Filter/Bias variables are marked persistable.
+    if (name == "weights" || name == "bias") {
+      var_desc->SetPersistable(true);
+    }
+  }
+
+  // Ops are appended in the order listed in the header comment.
+  SetOp(&program, "OP0", {"a"}, {"b"});
+  SetOp(&program, "OP1", {"b"}, {"c"});
+  SetOp(&program, "conv2d", {"c", "weights", "bias"}, {"f"});
+  SetOp(&program, "relu", {"f"}, {"g"});
+
+  return program;
+}
+
+TEST(ConvReLUFusePass, basic) {
+  auto program = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(program));
+
+  auto fuse_pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass");
+
+  int nodes_before = graph->Nodes().size();
+  graph = fuse_pass->Apply(std::move(graph));
+  int nodes_after = graph->Nodes().size();
+
+  // The pass removes 3 nodes (conv2d op, relu op, conv output var) and adds
+  // 1 node (the fused conv2d op), so the graph shrinks by 2 nodes.
+  EXPECT_EQ(nodes_before - 2, nodes_after);
+
+  // Count conv2d ops that carry both use_mkldnn == true and
+  // fuse_relu == true; exactly one such op must exist after fusion.
+  int fused_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (!node->IsOp() || node->Op()->Type() != "conv2d") {
+      continue;
+    }
+    auto* op_desc = node->Op();
+    if (!op_desc->HasAttr("use_mkldnn") ||
+        !boost::get<bool>(op_desc->GetAttr("use_mkldnn"))) {
+      continue;
+    }
+    if (!op_desc->HasAttr("fuse_relu") ||
+        !boost::get<bool>(op_desc->GetAttr("fuse_relu"))) {
+      continue;
+    }
+    ++fused_count;
+  }
+
+  EXPECT_EQ(fused_count, 1);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(conv_relu_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 5825a129b7..11d5998aaf 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -522,6 +522,39 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) {
   return false;
 }
 
+PDNode* patterns::ConvReLU::operator()(
+    paddle::framework::ir::PDNode* conv_input) {
+  // The caller-supplied node must feed the "Input" slot of a conv2d op.
+  conv_input->assert_is_op_input("conv2d", "Input");
+  auto* conv_node = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  auto* relu_node = pattern->NewNode(relu_repr())->assert_is_op("relu");
+  // Variable asserted persistable, wired to the conv2d "Filter" slot.
+  auto* filter_node = pattern->NewNode(conv_weight_repr())
+                          ->AsInput()
+                          ->assert_is_persistable_var()
+                          ->assert_is_op_input("conv2d", "Filter");
+  // Variable asserted persistable, wired to the conv2d "Bias" slot.
+  auto* bias_node = pattern->NewNode(conv_bias_repr())
+                        ->AsInput()
+                        ->assert_is_persistable_var()
+                        ->assert_is_op_input("conv2d", "Bias");
+  // conv2d result consumed by relu; flagged as an intermediate node.
+  auto* conv_result = pattern->NewNode(conv_out_repr())
+                          ->AsIntermediate()
+                          ->assert_is_only_output_of_op("conv2d")
+                          ->assert_is_op_input("relu");
+  // relu result; flagged as the output node and returned to the caller.
+  auto* relu_result = pattern->NewNode(relu_out_repr())
+                          ->AsOutput()
+                          ->assert_is_op_output("relu");
+
+  // Links: (conv_input, filter, bias) -> conv2d -> conv_result -> relu.
+  conv_node->LinksFrom({conv_input, filter_node, bias_node})
+      .LinksTo({conv_result});
+  relu_node->LinksFrom({conv_result}).LinksTo({relu_result});
+  return relu_result;
+}
+
 PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x,
                                  bool with_bias) {
   // Create shared nodes.
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 57482a07b6..371384dc56 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -360,6 +360,28 @@ struct PatternBase {
   size_t id_;
 };
 
+// CONV with ReLU
+// op: conv + relu
+// named nodes declared by this pattern:
+// conv_weight, conv_bias, conv_out, conv, relu_out, relu.
+// conv_input is not declared here: the caller supplies that PDNode as the
+// argument of operator().
+struct ConvReLU : public PatternBase {
+  ConvReLU(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_relu") {}
+
+  PDNode* operator()(PDNode* conv_input);
+
+  // declare the names of the operator nodes
+  PATTERN_DECL_NODE(conv);
+  PATTERN_DECL_NODE(relu);
+  // declare the names of the variable nodes
+  PATTERN_DECL_NODE(conv_weight);
+  PATTERN_DECL_NODE(conv_bias);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(relu_out);
+};
+
 // FC with bias
 // op: mul + elementwise_add
 // named nodes:

From b12322ce959a2ab79a1bae4e7aaf9e4b42d56909 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Wed, 12 Sep 2018 19:06:17 +0800
Subject: [PATCH 16/17] fix fusion_lstm unique_name bug

---
 paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc     | 5 ++---
 paddle/fluid/inference/analysis/ir_pass_manager.cc | 8 ++++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index f7fda87357..aa95d3e9f6 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -51,7 +51,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
     if (with_fc_bias) {
       // Add FC-bias with LSTM-bias and create a new weight
       PADDLE_ENFORCE(scope);
-      const std::string& new_bias_var = name_scope + "_bias.new";
+      const std::string& new_bias_var = patterns::UniqueKey("NewBias");
       auto* bias_var = scope->Var(new_bias_var);
       PADDLE_ENFORCE(bias_var);
       auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
@@ -120,7 +120,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-
     GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
@@ -136,7 +135,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
                    fc_bias);
       // Remove unneeded nodes.
       std::unordered_set<const Node*> marked_nodes(
-          {mul, lstm, elementwise_add});
+          {mul, lstm, elementwise_add, fc_bias});
       GraphSafeRemoveNodes(graph, marked_nodes);
     } else {
       GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 30c1e8e93d..e76708baf4 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/scope.h"
@@ -37,13 +38,16 @@ IRPassManager::IRPassManager(const ProgramDesc &program,
 void IRPassManager::Apply(const std::vector<std::string> &passes) {
   // Apply all the passes
   std::string pre_pass;
+  int pass_num = 0;
   for (const std::string &pass_name : passes) {
     PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name);
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
     if (pass_name == "graph_viz_pass") {
-      std::string dot_file_path =
-          "ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot";
+      std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
+                                  (pre_pass.empty() ? "origin" : pre_pass) +
+                                  ".dot";
       pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
+      pass_num++;
     }
     graph_ = pass->Apply(std::move(graph_));
     pre_pass = pass_name;

From e69d9c845b30d7150f122c41805b1bc5bf75136c Mon Sep 17 00:00:00 2001
From: Bai Yifan <bai.yf@qq.com>
Date: Thu, 13 Sep 2018 09:49:22 +0800
Subject: [PATCH 17/17] code fix (#13365)

---
 paddle/fluid/operators/softmax_with_cross_entropy_op.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 148faec4af..a07c17348e 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -31,7 +31,8 @@ __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size;
        i += blockDim.x * gridDim.x) {
     int idx = i * class_num + labels[i];
-    logit_grad[idx] -= static_cast<T>(1.);
+    logit_grad[idx] -=
+        ignore_index == labels[i] ? static_cast<T>(0.) : static_cast<T>(1.);
   }
 }