From dee6d1606f35a6cda8b788bb163763ca8a1d3257 Mon Sep 17 00:00:00 2001
From: Haihao Shen <haihao.shen@intel.com>
Date: Fri, 31 Aug 2018 15:35:36 +0800
Subject: [PATCH 01/85] Enable conv and batch norm by default

---
 python/paddle/fluid/transpiler/inference_transpiler.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index f79fcb24bb..02fefe32df 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -60,11 +60,10 @@ class InferenceTranspiler(object):
         if not isinstance(scope, core.Scope):
             raise TypeError("scope should be as Scope type or None")
         use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
+        self._fuse_batch_norm(program, place, scope)
         if use_mkldnn:
             self._fuse_relu_mkldnn(program)
             self._fuse_conv_bias_mkldnn(program)
-        else:
-            self._fuse_batch_norm(program, place, scope)
 
     def _fuse_relu_mkldnn(self, program):
         '''

From 5ec2fb0c93d0e799ea1fc215be0072488399c31e Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Fri, 31 Aug 2018 11:32:35 +0000
Subject: [PATCH 02/85] Add FlexibleDFS for finding a path between two nodes

---
 .../inference/analysis/data_flow_graph.cc     | 37 ++++++++++
 .../inference/analysis/data_flow_graph.h      |  3 +
 .../analysis/data_flow_graph_tester.cc        | 71 +++++++++++++++++++
 3 files changed, 111 insertions(+)

diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index 100a7504b8..e4f4bbf43c 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -480,6 +480,8 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
     for (auto *out : op_nodes[i]->outlinks) {
       if (follow_up_input_names.count(out->name())) {
         filtered_subgraph_outlinks.push_back(out);
+      } else {
+        out->SetDeleted();
       }
     }
     PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
@@ -487,6 +489,41 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
   }
 }
 
+// Iterative DFS from `source`, following inlinks if `reverse` is true and
+// outlinks otherwise. `enter` is called when a node is first visited, `leave`
+// when its marker is popped after the nodes pushed above it were handled.
+// Either callback may be empty; the walk stops once one of them returns false.
+void FlexibleDFS(const std::vector<Node *> &source, bool reverse,
+                 const std::function<bool(const Node *)> &enter,
+                 const std::function<bool(const Node *)> &leave) {
+  struct FNode {
+    const Node *node;
+    bool leave;  // true marks the post-visit entry of an entered node
+  };
+  std::vector<FNode> stack;
+  for (const Node *node : source) stack.push_back(FNode{node, false});
+  std::unordered_set<const Node *> visited;
+  while (!stack.empty()) {
+    const FNode fnode = stack.back();
+    stack.pop_back();
+
+    if (fnode.leave) {
+      // Markers are pushed only when `leave` is non-empty, so it is callable.
+      if (!leave(fnode.node)) return;
+      continue;
+    }
+    if (!visited.insert(fnode.node).second) continue;  // entered before
+    if (enter && !enter(fnode.node)) return;
+    if (leave) stack.push_back(FNode{fnode.node, true});
+    // Bind by reference to avoid copying the link vector on every visit.
+    const std::vector<Node *> &links =
+        reverse ? fnode.node->inlinks : fnode.node->outlinks;
+    for (const Node *next : links) {
+      if (!visited.count(next)) stack.push_back(FNode{next, false});
+    }
+  }
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index 437e097acd..4fefc175f3 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -204,6 +204,9 @@ std::pair<std::vector<Node *>, std::vector<Node *>>
 ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);  // NOLINT
 
 void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph);
+void FlexibleDFS(const std::vector<Node *> &source, bool reverse,
+                 const std::function<bool(const Node *)> &enter,
+                 const std::function<bool(const Node *)> &leave);
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index 1682011c3d..040ca19514 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -160,6 +160,77 @@ TEST(DataFlowGraph, Build_IR_Graph) {
   ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size());
 }
 
+// FlexibleDFS: forward walk, reverse walk and an indirect-path search.
+/*
+ * Graph topology
+ * inputs: 0
+ * 0 -> 1
+ * 1 -> 2
+ * 1 -> 3
+ * 3 -> 4
+ * 4 -> 5
+ * 5 -> 2
+ */
+TEST(DataFlowGraph, flexibledfs) {
+  DataFlowGraph graph;
+
+  for (int i = 0; i < 6; i++) {
+    auto* node = graph.nodes.Create(Node::Type::kValue);
+    node->SetName("node-" + std::to_string(i));
+  }
+
+  auto add_link = [&](int i, int j) {
+    Node* source = graph.nodes.GetMutable(i);
+    Node* target = graph.nodes.GetMutable(j);
+    target->inlinks.push_back(source);
+    source->outlinks.push_back(target);
+  };
+
+  add_link(0, 1);
+  add_link(1, 2);
+  add_link(1, 3);
+  add_link(3, 4);
+  add_link(4, 5);
+  add_link(5, 2);
+  graph.Build();
+
+  std::vector<const Node*> order;
+  FlexibleDFS(graph.inputs(), false, nullptr, [&order](const Node* n) {
+    order.push_back(n);
+    return true;
+  });
+
+  ASSERT_EQ(order.size(), 6UL);
+
+  order.clear();
+  // reverse dfs
+  FlexibleDFS(graph.outputs(), true, nullptr, [&order](const Node* n) {
+    order.push_back(n);
+    return true;
+  });
+
+  ASSERT_EQ(order.size(), 6UL);
+
+  // Is node 2 also reachable from node 1 through another inlink of node 2?
+  Node* last_node = graph.nodes.GetMutable(2);
+  Node* direct_node = graph.nodes.GetMutable(1);
+  std::vector<Node*> source_nodes;
+  for (Node* node : last_node->inlinks) {
+    if (node != direct_node) source_nodes.push_back(node);
+  }
+
+  bool has_cycle = false;
+  FlexibleDFS(source_nodes, true, nullptr,
+              [&has_cycle, direct_node](const Node* n) {
+                if (n == direct_node) {
+                  has_cycle = true;
+                  return false;
+                }
+                return true;
+              });
+  ASSERT_TRUE(has_cycle);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle

From 82a1b35b9b64d96e2715e58cd4779a1324f1e139 Mon Sep 17 00:00:00 2001
From: fengjiayi <hzfengjy@126.com>
Date: Tue, 4 Sep 2018 11:25:43 +0800
Subject: [PATCH 03/85] Revert "Revert "Add CudnnHolder and use it in Conv and
 ConvTranspose op""

This reverts commit 151e169eb75a8ee96e0c1e50605fa811cb65acf4.
---
 paddle/fluid/framework/rw_lock.h              | 71 ++++++++++++++++++
 paddle/fluid/operators/conv_cudnn_op.cu.cc    | 57 +++++++-------
 .../operators/conv_transpose_cudnn_op.cu.cc   | 59 +++++++--------
 paddle/fluid/platform/device_context.cc       | 74 ++++++++++++++++---
 paddle/fluid/platform/device_context.h        |  8 +-
 5 files changed, 196 insertions(+), 73 deletions(-)

diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h
index a068d3543d..da163835e8 100644
--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@@ -56,5 +56,76 @@ struct RWLock {
 };
 #endif
 
+// Scoped guard for an RWLock. It tracks whether it currently holds the read
+// lock, the write lock or nothing, and its destructor releases whatever is
+// still held. Changing between read and write mode requires UnLock() first;
+// asking for the opposite mode while locked throws via PADDLE_THROW.
+class RWLockGuard {
+ public:
+  enum Status { kUnLock, kWRLock, kRDLock };
+
+  // Stores `rw_lock` (not owned) and acquires it in mode `init_status`;
+  // kUnLock leaves the lock untouched.
+  RWLockGuard(RWLock* rw_lock, Status init_status)
+      : lock_(rw_lock), status_(Status::kUnLock) {
+    switch (init_status) {
+      case Status::kRDLock:
+        RDLock();
+        break;
+      case Status::kWRLock:
+        WRLock();
+        break;
+      case Status::kUnLock:
+        break;
+    }
+  }
+
+  // Not copyable: the implicitly generated copy would share lock_/status_ and
+  // both destructors would then unlock the same RWLock.
+  RWLockGuard(const RWLockGuard&) = delete;
+  RWLockGuard& operator=(const RWLockGuard&) = delete;
+
+  // Takes the write lock. No-op if it is already held; throws if the guard
+  // currently holds the read lock.
+  void WRLock() {
+    if (status_ == Status::kRDLock) {
+      PADDLE_THROW(
+          "Please unlock read lock first before invoking write lock.");
+    }
+    if (status_ == Status::kUnLock) {
+      lock_->WRLock();
+      status_ = Status::kWRLock;
+    }
+  }
+
+  // Takes the read lock. No-op if it is already held; throws if the guard
+  // currently holds the write lock.
+  void RDLock() {
+    if (status_ == Status::kWRLock) {
+      PADDLE_THROW(
+          "Please unlock write lock first before invoking read lock.");
+    }
+    if (status_ == Status::kUnLock) {
+      lock_->RDLock();
+      status_ = Status::kRDLock;
+    }
+  }
+
+  // Releases the lock if the guard holds it; does nothing otherwise.
+  void UnLock() {
+    if (status_ != Status::kUnLock) {
+      lock_->UNLock();
+      status_ = Status::kUnLock;
+    }
+  }
+
+  // Releases any lock still held when the guard goes out of scope.
+  ~RWLockGuard() { UnLock(); }
+
+ private:
+  RWLock* lock_;
+  Status status_;
+};
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 22cbf680c0..4a7a6bcf71 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         output_channels / groups * output_height * output_width * output_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn conv workspace ---------------------
-    void* cudnn_workspace = nullptr;
     size_t workspace_size_in_bytes;  // final workspace to allocate.
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
     if (user_workspace_size > 0) {
@@ -159,20 +158,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
 
-    // Allocate on GPU memory
-    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-          handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
-          cudnn_filter_desc, filter_data + i * group_offset_filter,
-          cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
-          &beta, cudnn_output_desc, output_data + i * group_offset_out));
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
+            cudnn_filter_desc, filter_data + i * group_offset_filter,
+            cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
+            &beta, cudnn_output_desc, output_data + i * group_offset_out));
+      };
+      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
     }
-    // Release the cudnn workspace
-    paddle::memory::Free(gpu, cudnn_workspace);
   }
 };
 
@@ -314,11 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
               cudnn_filter_desc, filter_algo, &tmp_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
     }
-    // ------------------- cudnn conv workspace ---------------------
-    // Already on GPU
-    void* cudnn_workspace = nullptr;
-    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+
     // ------------------- cudnn conv backward data ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
@@ -326,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset input_grad.
 
       for (int i = 0; i < groups; i++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-            handle, &alpha, cudnn_filter_desc,
-            filter_data + i * group_offset_filter, cudnn_output_grad_desc,
-            output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-            input_grad_data + i * group_offset_in));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+              handle, &alpha, cudnn_filter_desc,
+              filter_data + i * group_offset_filter, cudnn_output_grad_desc,
+              output_grad_data + i * group_offset_out, cudnn_conv_desc,
+              data_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+              cudnn_input_desc, input_grad_data + i * group_offset_in));
+        };
+        dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
       }
     }
     // ------------------- cudnn conv backward filter ---------------------
@@ -339,16 +335,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       for (int i = 0; i < groups; i++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
-            cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
-            cudnn_conv_desc, filter_algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_filter_desc,
-            filter_grad_data + i * group_offset_filter));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+              handle, &alpha, cudnn_input_desc,
+              input_data + i * group_offset_in, cudnn_output_grad_desc,
+              output_grad_data + i * group_offset_out, cudnn_conv_desc,
+              filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+              cudnn_filter_desc, filter_grad_data + i * group_offset_filter));
+        };
+        dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
       }
     }
-    // Release the cudnn workspace
-    paddle::memory::Free(gpu, cudnn_workspace);
   }
 };
 
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
index 82fff68e75..73831611d0 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
@@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
         conv_desc.descriptor<T>(paddings, strides, dilations);
 
     // ------------------- cudnn conv workspace ---------------------
-    void* cudnn_workspace = nullptr;
     size_t workspace_size_in_bytes;  // final workspace to allocate.
     size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
     if (user_workspace_size > 0) {
@@ -100,25 +99,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
             handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
             cudnn_output_desc, algo, &workspace_size_in_bytes));
 
-    // Allocate on GPU memory
-    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
-
     // ------------------- cudnn conv transpose forward ---------------------
     int input_offset = input->numel() / input->dims()[0] / groups;
     int output_offset = output->numel() / output->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     for (int g = 0; g < groups; g++) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
-          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
-          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
-          cudnn_output_desc, output_data + output_offset * g));
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+            handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+            cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+            algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+            cudnn_output_desc, output_data + output_offset * g));
+      };
+      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
     }
-
-    // Release the cudnn workspace
-    paddle::memory::Free(gpu, cudnn_workspace);
   }
 };
 
@@ -206,11 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
           std::max(workspace_size_in_bytes, bwd_filter_ws_size);
     }
 
-    // ------------------- cudnn conv workspace ---------------------
-    // Already on GPU
-    void* cudnn_workspace = nullptr;
-    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
-    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     // FIXME(typhoonzero): template type T may not be the same as cudnn call.
     int input_offset = input->numel() / input->dims()[0] / groups;
@@ -222,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-            handle, &alpha, cudnn_output_desc,
-            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
-            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-            input_grad_data + input_offset * g));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+              handle, &alpha, cudnn_output_desc,
+              output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+              filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+              cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+              input_grad_data + input_offset * g));
+        };
+        dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
       }
     }
 
@@ -237,17 +230,17 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       for (int g = 0; g < groups; g++) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-            handle, &alpha, cudnn_output_desc,
-            output_grad_data + output_grad_offset * g, cudnn_input_desc,
-            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
-            filter_grad_data + filter_offset * g));
+        auto cudnn_func = [&](void* cudnn_workspace) {
+          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+              handle, &alpha, cudnn_output_desc,
+              output_grad_data + output_grad_offset * g, cudnn_input_desc,
+              input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+              cudnn_workspace, workspace_size_in_bytes, &beta,
+              cudnn_filter_desc, filter_grad_data + filter_offset * g));
+        };
+        dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
       }
     }
-
-    // Release the cudnn workspace
-    paddle::memory::Free(gpu, cudnn_workspace);
   }
 };
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 2cc26da013..3ec20ad7e5 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -16,6 +16,9 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/memory/memory.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/framework/rw_lock.h"
+#endif
 
 namespace paddle {
 namespace platform {
@@ -142,7 +145,59 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   mutable unsigned int* semaphore_;
 };
 
-CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
+class CudnnHolder {
+ public:
+  CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place)
+      : workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) {
+    PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_));
+  }
+
+  cudnnHandle_t cudnn_handle() const { return cudnn_handle_; }
+
+  void RunFunc(const std::function<void(void*)>& cudnn_func,
+               size_t required_workspace_len) {
+    std::lock_guard<std::mutex> lock(mtx_);
+    if (required_workspace_len > workspace_len_) {
+      ReallocateWorkspace(required_workspace_len);
+    }
+    cudnn_func(workspace_);
+  }
+
+  ~CudnnHolder() {
+    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
+    if (workspace_ != nullptr) {
+      paddle::memory::Free(place_, workspace_);
+    }
+  }
+
+ private:
+  void ReallocateWorkspace(size_t required_workspace_len) {
+    if (required_workspace_len <= workspace_len_) {
+      return;
+    }
+    void* new_workspace = paddle::memory::Alloc(place_, required_workspace_len);
+    if (workspace_ != nullptr) {
+      // Maybe someone is using the current workspace
+      PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
+      paddle::memory::Free(place_, workspace_);
+    }
+    workspace_ = new_workspace;
+    workspace_len_ = required_workspace_len;
+  }
+
+  cudnnHandle_t cudnn_handle_;
+  void* workspace_;
+  size_t workspace_len_;
+
+  const cudaStream_t* stream_;  // not owned;
+  const CUDAPlace place_;
+
+  std::mutex mtx_;
+};
+
+CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
+    : place_(place), cudnn_holder_(nullptr) {
   SetDeviceId(place_.device);
   compute_capability = GetCUDAComputeCapability(place_.device);
   multi_process = GetCUDAMultiProcessors(place_.device);
@@ -154,10 +209,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
   PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
   PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
   if (dynload::HasCUDNN()) {
-    PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
-    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
-  } else {
-    cudnn_handle_ = nullptr;
+    cudnn_holder_.reset(new CudnnHolder(&stream_, place));
   }
 }
 
@@ -165,9 +217,6 @@ CUDADeviceContext::~CUDADeviceContext() {
   SetDeviceId(place_.device);
   Wait();
   PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
-  if (cudnn_handle_ != nullptr) {
-    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
-  }
   eigen_stream_.reset();
   eigen_device_.reset();
   PADDLE_ENFORCE(cudaStreamDestroy(stream_));
@@ -196,7 +245,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const {
   return cublas_handle_;
 }
 
-cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
+cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
+  return cudnn_holder_ ? cudnn_holder_->cudnn_handle() : nullptr;  // no cuDNN
+}
+
+void CUDADeviceContext::RunCudnnFuncWithWorkspace(
+    const std::function<void(void*)>& cudnn_func, size_t workspace_len) const {
+  cudnn_holder_->RunFunc(cudnn_func, workspace_len);
+}
 
 cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index b97dad20db..3ed49fc423 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -69,6 +69,7 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
 #ifdef PADDLE_WITH_CUDA
 
 class EigenCudaStreamDevice;
+class CudnnHolder;
 
 class CUDADeviceContext : public DeviceContext {
  public:
@@ -96,6 +97,11 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return cudnn  handle in the device context. */
   cudnnHandle_t cudnn_handle() const;
 
+  /*! \brief  Run a cudnn function with the workspace provided by
+   * CUDADeviceContext */
+  void RunCudnnFuncWithWorkspace(const std::function<void(void*)>& cudnn_func,
+                                 size_t workspace_len) const;
+
   /*! \brief  Return cuda stream in the device context. */
   cudaStream_t stream() const;
 
@@ -111,8 +117,8 @@ class CUDADeviceContext : public DeviceContext {
 
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
+  std::unique_ptr<CudnnHolder> cudnn_holder_;
   cudaStream_t stream_;
-  cudnnHandle_t cudnn_handle_;
   cublasHandle_t cublas_handle_;
 
   int compute_capability;

From 7b577b92e04ff3ac62eefe2837f90eb4d266b413 Mon Sep 17 00:00:00 2001
From: fengjiayi <hzfengjy@126.com>
Date: Tue, 4 Sep 2018 11:27:24 +0800
Subject: [PATCH 04/85] fix a memory bug in CudnnHolder

---
 paddle/fluid/platform/device_context.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 3ec20ad7e5..c6f1d1f3d5 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -176,13 +176,12 @@ class CudnnHolder {
     if (required_workspace_len <= workspace_len_) {
       return;
     }
-    void* new_workspace = paddle::memory::Alloc(place_, required_workspace_len);
     if (workspace_ != nullptr) {
       // Maybe someone is using the current workspace
       PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
       paddle::memory::Free(place_, workspace_);
     }
-    workspace_ = new_workspace;
+    workspace_ = paddle::memory::Alloc(place_, required_workspace_len);
     workspace_len_ = required_workspace_len;
   }
 

From 6edfae4234ebe28d3c14954b0117536ced65758f Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Fri, 7 Sep 2018 14:34:21 +0800
Subject: [PATCH 05/85] reset received vars on pserver

---
 .../distributed/request_handler_impl.cc       | 13 ------
 .../distributed/request_handler_impl.h        |  5 ---
 .../fluid/operators/distributed/rpc_server.cc |  8 ++++
 .../fluid/operators/distributed/rpc_server.h  |  6 ++-
 paddle/fluid/operators/listen_and_serv_op.cc  | 42 ++++++++++++++++---
 paddle/fluid/operators/listen_and_serv_op.h   | 10 ++++-
 6 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 31159a0259..849e412504 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -67,24 +67,11 @@ bool RequestSendHandler::Handle(const std::string& varname,
         LOG(FATAL) << "sync: Can not find server side var: " << varname;
         return false;
       }
-
-      if (invar->IsType<framework::SelectedRows>()) {
-        std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
-        sparse_vars_.push_back(invar);
-      }
     }
   }
   return true;
 }
 
-void RequestSendHandler::ResetSparseVarRecorder() {
-  std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
-  for (auto* var : sparse_vars_) {
-    var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
-  }
-  sparse_vars_.clear();
-}
-
 bool RequestGetHandler::Handle(const std::string& varname,
                                framework::Scope* scope,
                                framework::Variable* invar,
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h
index 87185500f2..8be5b21bb8 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
@@ -41,11 +41,6 @@ class RequestSendHandler final : public RequestHandler {
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
               const std::string& out_var_name = "") override;
-  void ResetSparseVarRecorder();
-
- private:
-  std::mutex mutex_sparse_vars_;
-  std::vector<framework::Variable*> sparse_vars_;
 };
 
 class RequestGetHandler final : public RequestHandler {
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index 406e7294c1..084480ae48 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -101,6 +101,8 @@ void RPCServer::Complete() {
   {
     std::unique_lock<std::mutex> lock(mutex_);
     client_num_--;
+    need_reset_all_vars_ = true;
+
     VLOG(4) << "decrease client_num to: " << client_num_;
     if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
       barrier_counter_[kRequestGet]--;
@@ -109,6 +111,11 @@ void RPCServer::Complete() {
   barrier_cond_.notify_all();
 }
 
+bool RPCServer::NeedResetAllVars() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  return need_reset_all_vars_;
+}
+
 int RPCServer::GetClientNum() {
   std::unique_lock<std::mutex> lock(mutex_);
   return client_num_;
@@ -120,6 +127,7 @@ void RPCServer::ResetBarrierCounter() {
   for (auto& t : barrier_counter_) {
     t.second = 0;
   }
+  need_reset_all_vars_ = false;
 }
 
 void RPCServer::RegisterRPC(const std::string& rpc_name,
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index d813ba03e2..d88e8c640f 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -49,7 +49,8 @@ class RPCServer {
         bind_address_(address),
         exit_flag_(false),
         selected_port_(0),
-        client_num_(client_num) {}
+        client_num_(client_num),
+        need_reset_all_vars_(false) {}
 
   virtual ~RPCServer() {}
   virtual void StartServer() = 0;
@@ -86,6 +87,8 @@ class RPCServer {
   void ResetBarrierCounter();
   RPCServerProfiler& Profiler() { return profiler_; }
 
+  bool NeedResetAllVars();
+
  protected:
   virtual void ShutDownImpl() = 0;
 
@@ -104,6 +107,7 @@ class RPCServer {
   std::atomic<int> exit_flag_;
   int selected_port_;
   int client_num_;
+  bool need_reset_all_vars_;
 
   std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
   std::unordered_map<std::string, int> rpc_thread_num_;
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 4cc2159d9f..1933e6a5d0 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 
 #include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
@@ -101,9 +102,10 @@ static int64_t GetTimestamp() {
 
 void ListenAndServOp::RunSyncLoop(
     framework::Executor *executor, framework::ProgramDesc *program,
-    framework::Scope *recv_scope,
+    framework::Scope *recv_scope, platform::DeviceContext *dev_ctx,
     const std::vector<int> &prefetch_block_id_list,
-    const int checkpoint_point_block_id) const {
+    const int checkpoint_point_block_id,
+    const std::vector<std::string> &recv_varnames) const {
   VLOG(2) << "RunSyncLoop";
   size_t num_blocks = program->Size();
   auto optimize_blocks =
@@ -166,8 +168,8 @@ void ListenAndServOp::RunSyncLoop(
     VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
 
     // reset received sparse vars to avoid reuse it in the next mini-batch
-    dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
-        ->ResetSparseVarRecorder();
+    ResetReceivedVars(recv_varnames, recv_scope, dev_ctx,
+                      !rpc_service_->NeedResetAllVars());
 
     rpc_service_->SetCond(distributed::kRequestGet);
     rpc_service_->WaitBarrier(distributed::kRequestGet);
@@ -175,6 +177,33 @@ void ListenAndServOp::RunSyncLoop(
   }  // while(true)
 }
 
+void ListenAndServOp::ResetReceivedVars(
+    const std::vector<std::string> &recv_varnames, framework::Scope *recv_scope,
+    platform::DeviceContext *dev_ctx, bool only_sparse_vars) const {
+  for (auto &varname : recv_varnames) {
+    auto var = recv_scope->FindVar(varname);
+    if (var == nullptr) {
+      VLOG(2) << "can not find var " << varname << " in received scope";
+      continue;
+    }
+    if (var->IsType<framework::SelectedRows>()) {
+      var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
+    }
+    if (!only_sparse_vars) {
+      if (var->IsType<framework::LoDTensor>()) {
+        math::set_constant(*dev_ctx, var->GetMutable<framework::LoDTensor>(),
+                           static_cast<float>(0));
+      } else if (var->IsType<framework::Tensor>()) {
+        math::set_constant(*dev_ctx, var->GetMutable<framework::Tensor>(),
+                           static_cast<float>(0));
+      } else {
+        PADDLE_THROW(
+            "received var should be in [SelectedRows, LoDTensor, Tensor]");
+      }
+    }
+  }
+}
+
 void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                                    framework::ProgramDesc *program,
                                    framework::Scope *recv_scope) const {
@@ -258,6 +287,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
 
   bool sync_mode = Attr<bool>("sync_mode");
   auto fan_in = Attr<int>("Fanin");
+  auto inputs = Inputs("X");
 
   PADDLE_ENFORCE(!rpc_service_);
   std::string endpoint = Attr<std::string>("endpoint");
@@ -351,8 +381,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   // Write to a file of server selected port for python use.
   SavePort();
   if (sync_mode) {
-    RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list,
-                checkpoint_block_id);
+    RunSyncLoop(&executor, program, &recv_scope, &dev_ctx,
+                prefetch_block_id_list, checkpoint_block_id, inputs);
   } else {
     RunAsyncLoop(&executor, program, &recv_scope);
   }
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
index 978969cc51..f84baa36eb 100644
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/rpc_server.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
@@ -48,8 +49,10 @@ class ListenAndServOp : public framework::OperatorBase {
   void RunSyncLoop(framework::Executor* executor,
                    framework::ProgramDesc* program,
                    framework::Scope* recv_scope,
+                   platform::DeviceContext* dev_ctx,
                    const std::vector<int>& prefetch_block_id_list,
-                   const int checkpoint_point_block_id) const;
+                   const int checkpoint_point_block_id,
+                   const std::vector<std::string>& recv_varnames) const;
 
   void RunAsyncLoop(framework::Executor* executor,
                     framework::ProgramDesc* program,
@@ -64,6 +67,11 @@ class ListenAndServOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override;
 
+  void ResetReceivedVars(const std::vector<std::string>& recv_varnames,
+                         framework::Scope* recv_scope,
+                         platform::DeviceContext* dev_ctx,
+                         bool only_sparse_vars = true) const;
+
  protected:
   mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
   mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;

From 580f55fa0f73c7d418e92672253708c648599710 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Mon, 10 Sep 2018 10:38:07 +0800
Subject: [PATCH 06/85] update by comment

---
 paddle/fluid/operators/listen_and_serv_op.cc | 9 +++++----
 paddle/fluid/operators/listen_and_serv_op.h  | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 1933e6a5d0..abbb3d06d1 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -167,9 +167,8 @@ void ListenAndServOp::RunSyncLoop(
                           recv_scope);
     VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
 
-    // reset received sparse vars to avoid reuse it in the next mini-batch
     ResetReceivedVars(recv_varnames, recv_scope, dev_ctx,
-                      !rpc_service_->NeedResetAllVars());
+                      rpc_service_->NeedResetAllVars());
 
     rpc_service_->SetCond(distributed::kRequestGet);
     rpc_service_->WaitBarrier(distributed::kRequestGet);
@@ -179,7 +178,7 @@ void ListenAndServOp::RunSyncLoop(
 
 void ListenAndServOp::ResetReceivedVars(
     const std::vector<std::string> &recv_varnames, framework::Scope *recv_scope,
-    platform::DeviceContext *dev_ctx, bool only_sparse_vars) const {
+    platform::DeviceContext *dev_ctx, bool reset_all) const {
   for (auto &varname : recv_varnames) {
     auto var = recv_scope->FindVar(varname);
     if (var == nullptr) {
@@ -187,9 +186,11 @@ void ListenAndServOp::ResetReceivedVars(
       continue;
     }
     if (var->IsType<framework::SelectedRows>()) {
+      VLOG(3) << "reset sparse var: " << varname;
       var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
     }
-    if (!only_sparse_vars) {
+    if (UNLIKELY(reset_all)) {
+      VLOG(3) << "reset dense var: " << varname;
       if (var->IsType<framework::LoDTensor>()) {
         math::set_constant(*dev_ctx, var->GetMutable<framework::LoDTensor>(),
                            static_cast<float>(0));
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
index f84baa36eb..5102c963b9 100644
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -70,7 +70,7 @@ class ListenAndServOp : public framework::OperatorBase {
   void ResetReceivedVars(const std::vector<std::string>& recv_varnames,
                          framework::Scope* recv_scope,
                          platform::DeviceContext* dev_ctx,
-                         bool only_sparse_vars = true) const;
+                         bool reset_all = false) const;
 
  protected:
   mutable std::shared_ptr<distributed::RPCServer> rpc_service_;

From 926e1077ca07b86c42b87a418efcf07fb820e3af Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 24 Aug 2018 18:57:36 +0800
Subject: [PATCH 07/85] version

---
 paddle/fluid/framework/CMakeLists.txt  |  4 +++-
 paddle/fluid/framework/framework.proto | 13 ++++++++++-
 paddle/fluid/framework/program_desc.cc |  8 +++++++
 paddle/fluid/framework/program_desc.h  |  2 ++
 paddle/fluid/framework/version.cc      | 28 ++++++++++++++++++++++++
 paddle/fluid/framework/version.h       | 30 ++++++++++++++++++++++++++
 paddle/fluid/inference/io.cc           |  3 +++
 paddle/fluid/pybind/protobuf.cc        |  5 ++++-
 paddle/fluid/pybind/pybind.cc          |  4 ++++
 python/paddle/fluid/framework.py       |  3 +++
 python/paddle/fluid/io.py              |  5 +++++
 11 files changed, 102 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/framework/version.cc
 create mode 100644 paddle/fluid/framework/version.h

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index cc7938b2ac..8af9a8f68a 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -116,7 +116,9 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope gl
 endif(NOT WIN32)
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
-cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
+
+cc_library(version SRCS version.cc)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index c658843581..8517d01cfe 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -16,6 +16,13 @@ syntax = "proto2";
 option optimize_for = LITE_RUNTIME;
 package paddle.framework.proto;
 
+// Any incompatible changes to ProgramDesc and its dependencies should
+// raise the version defined in version.h.
+//
+// Serailization and Deserialization codes should be modified in a way
+// that supports old versions following the version and compatibility policy.
+message Version { optional int64 version = 1 [ default = -1 ]; }
+
 enum AttrType {
   INT = 0;
   FLOAT = 1;
@@ -180,4 +187,8 @@ message BlockDesc {
 // for more details.
 // TODO(panyx0718): A model can have multiple programs. Need a
 // way to distinguish them. Maybe ID or name?
-message ProgramDesc { repeated BlockDesc blocks = 1; }
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+
+  optional Version version = 2;
+}
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index a63944eaee..f2e0b79c4b 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/version.h"
 
 namespace paddle {
 namespace framework {
@@ -31,6 +32,10 @@ void ProgramDesc::Flush() {
   for (auto &block : blocks_) {
     block->Flush();
   }
+  // If not loaded, use current code version.
+  if (desc_.version().version() < 0) {
+    desc_.mutable_version()->set_version(kCurProgramVersion);
+  }
 }
 
 proto::ProgramDesc *ProgramDesc::Proto() {
@@ -38,7 +43,10 @@ proto::ProgramDesc *ProgramDesc::Proto() {
   return &desc_;
 }
 
+int ProgramDesc::Version() const { return desc_.version().version(); }
+
 ProgramDesc::ProgramDesc() {
+  desc_.mutable_version()->set_version(kCurProgramVersion);
   auto *block = desc_.mutable_blocks()->Add();
   block->set_idx(kRootBlockIndex);
   block->set_parent_idx(kNoneBlockIndex);
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index a0e81cade1..9cf3714b6a 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -57,6 +57,8 @@ class ProgramDesc {
 
   proto::ProgramDesc *Proto();
 
+  int Version() const;
+
   // The output variable of feed_op is referenced as feed_target.
   // This function is used to collect the output variable's name of all
   // feed_ops.
diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc
new file mode 100644
index 0000000000..b0d5c26a31
--- /dev/null
+++ b/paddle/fluid/framework/version.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/version.h"
+#include <algorithm>
+
+namespace paddle {
+namespace framework {
+bool IsProgramVersionSupported(int version) {
+  static int num_supported =
+      sizeof(kSupportedProgramVersion) / sizeof(kSupportedProgramVersion[0]);
+  return std::find(kSupportedProgramVersion,
+                   kSupportedProgramVersion + num_supported,
+                   version) != kSupportedProgramVersion + num_supported;
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h
new file mode 100644
index 0000000000..2960ac9782
--- /dev/null
+++ b/paddle/fluid/framework/version.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+
+// The program version the current codes generate.
+constexpr int kCurProgramVersion = 0;
+
+// The program version that was generated by previous or current codes
+// and supported by current codes.
+constexpr int kSupportedProgramVersion[] = {0};
+
+bool IsProgramVersionSupported(int version);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index cef7b2a7e3..fa59cca383 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/pybind/pybind.h"
 
@@ -124,6 +125,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
+  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()));
 
   LoadPersistables(executor, scope, *main_program, dirname, "");
   return main_program;
@@ -138,6 +140,7 @@ std::unique_ptr<framework::ProgramDesc> Load(
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
+  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()));
 
   LoadPersistables(executor, scope, *main_program, "", param_filename);
   return main_program;
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index f21f8d23f9..67501186d1 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -137,7 +137,10 @@ void BindProgramDesc(pybind11::module *m) {
              PADDLE_ENFORCE(desc->ParseFromString(data),
                             "Fail to parse ProgramDesc from string. This could "
                             "be a bug of Paddle.");
-           });
+           })
+      .def("_version", [](pd::ProgramDesc &self) -> int64_t {
+        return self.Proto()->version().version();
+      });
 }
 
 void BindBlockDesc(pybind11::module *m) {
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 5b20b87174..191241de7d 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -517,6 +517,10 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_glog", framework::InitGLOG);
   m.def("init_devices",
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
+  m.def("_supported_version", []() {
+    std::vector<int> supported_versions;
+    return supported_versions;
+  });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index b0e0d27ff7..8892606486 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1564,6 +1564,9 @@ class Program(object):
         """
         return self.desc
 
+    def _version(self):
+        return self.desc._version()
+
     def clone(self, for_test=False):
         """
         Create a new, duplicated program.
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 5c4ec99c53..f72ca0a8d5 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -750,6 +750,11 @@ def load_inference_model(dirname,
         program_desc_str = f.read()
 
     program = Program.parse_from_string(program_desc_str)
+    # TODO(panyx0718): Link to our version and compatibility guide.
+    if program._version() != 0:
+        raise ValueError("Unsupported program version: %d\n" %
+                         program._version())
+    # Binary data also need versioning.
     load_persistables(executor, dirname, program, params_filename)
 
     if pserver_endpoints:

From 56a977d4363a1af3a0944dfdafcef01636a642ee Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Mon, 27 Aug 2018 18:11:06 +0800
Subject: [PATCH 08/85] add test

---
 paddle/fluid/framework/CMakeLists.txt  |  2 ++
 paddle/fluid/framework/version_test.cc | 26 ++++++++++++++++++++++++++
 paddle/fluid/pybind/pybind.cc          |  3 +++
 python/paddle/fluid/io.py              |  2 +-
 4 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/framework/version_test.cc

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 8af9a8f68a..1c9130305c 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -118,6 +118,8 @@ endif(NOT WIN32)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 
 cc_library(version SRCS version.cc)
+cc_test(version_test SRCS version_test.cc DEPS version)
+
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc
new file mode 100644
index 0000000000..cc57f713d8
--- /dev/null
+++ b/paddle/fluid/framework/version_test.cc
@@ -0,0 +1,26 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/version.h"
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace framework {
+TEST(Variable, GetMutable) {
+  EXPECT_TRUE(IsProgramVersionSupported(0));
+  EXPECT_FALSE(IsProgramVersionSupported(1));
+  EXPECT_FALSE(IsProgramVersionSupported(-1));
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 191241de7d..6d85d01477 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -33,6 +33,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -534,6 +535,8 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
 
+  m.def("_is_program_version_supported", IsProgramVersionSupported);
+
   BindProgramDesc(&m);
   BindBlockDesc(&m);
   BindVarDsec(&m);
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index f72ca0a8d5..3e02e14167 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -751,7 +751,7 @@ def load_inference_model(dirname,
 
     program = Program.parse_from_string(program_desc_str)
     # TODO(panyx0718): Link to our version and compatibility guide.
-    if program._version() != 0:
+    if not core._is_program_version_supported(program._version()):
         raise ValueError("Unsupported program version: %d\n" %
                          program._version())
     # Binary data also need versioning.

From c69cf6dde879318ea10a7ddc0dd3dabd4d9be358 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Mon, 27 Aug 2018 20:23:28 +0800
Subject: [PATCH 09/85] fix

---
 paddle/fluid/framework/framework.proto | 2 +-
 paddle/fluid/inference/io.cc           | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 8517d01cfe..460401df54 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -21,7 +21,7 @@ package paddle.framework.proto;
 //
 // Serailization and Deserialization codes should be modified in a way
 // that supports old versions following the version and compatibility policy.
-message Version { optional int64 version = 1 [ default = -1 ]; }
+message Version { optional int64 version = 1 [ default = 0 ]; }
 
 enum AttrType {
   INT = 0;
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index fa59cca383..1d20643ce0 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -125,7 +125,8 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()));
+  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
+                 "model version %d is not supported.", main_program->Version());
 
   LoadPersistables(executor, scope, *main_program, dirname, "");
   return main_program;
@@ -140,7 +141,8 @@ std::unique_ptr<framework::ProgramDesc> Load(
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
-  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()));
+  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
+                 "model version %d is not supported.", main_program->Version());
 
   LoadPersistables(executor, scope, *main_program, "", param_filename);
   return main_program;

From 4313d870a2a4e99c3a039949224fff41750b1e52 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Mon, 27 Aug 2018 20:56:38 +0800
Subject: [PATCH 10/85] refine

---
 paddle/fluid/framework/CMakeLists.txt  |  4 ++--
 paddle/fluid/framework/lod_tensor.cc   |  7 +++++--
 paddle/fluid/framework/version.cc      | 10 +++++++++-
 paddle/fluid/framework/version.h       | 15 ++++++++++++---
 paddle/fluid/framework/version_test.cc |  6 +++++-
 paddle/fluid/inference/io.cc           |  6 ++++--
 6 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 1c9130305c..d998109df2 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -56,9 +56,9 @@ else()
   cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
 if (NOT WIN32)
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
 else()
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
 endif (NOT WIN32)
 
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index adeb26e4e7..1e7da9a69c 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/framework/version.h"
 
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
@@ -251,8 +252,8 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
 void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
                        const platform::DeviceContext &dev_ctx) {
   {  // the 1st field, uint32_t version for LoDTensor
-    constexpr uint32_t version = 0;
-    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+    os.write(reinterpret_cast<const char *>(&kCurTensorVersion),
+             sizeof(kCurTensorVersion));
   }
   {
     // the 2st field, LoD information
@@ -281,6 +282,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
     // the 1st field, unit32_t version for LoDTensor
     uint32_t version;
     is.read(reinterpret_cast<char *>(&version), sizeof(version));
+    PADDLE_ENFORCE(framework::IsTensorVersionSupported(version),
+                   "tensor version %u is not supported.", version);
     PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
   }
   {
diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc
index b0d5c26a31..3d559e26e0 100644
--- a/paddle/fluid/framework/version.cc
+++ b/paddle/fluid/framework/version.cc
@@ -17,12 +17,20 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-bool IsProgramVersionSupported(int version) {
+bool IsProgramVersionSupported(int64_t version) {
   static int num_supported =
       sizeof(kSupportedProgramVersion) / sizeof(kSupportedProgramVersion[0]);
   return std::find(kSupportedProgramVersion,
                    kSupportedProgramVersion + num_supported,
                    version) != kSupportedProgramVersion + num_supported;
 }
+
+bool IsTensorVersionSupported(uint32_t version) {
+  static int num_supported =
+      sizeof(kSupportedTensorVersion) / sizeof(kSupportedTensorVersion[0]);
+  return std::find(kSupportedTensorVersion,
+                   kSupportedTensorVersion + num_supported,
+                   version) != kSupportedTensorVersion + num_supported;
+}
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h
index 2960ac9782..bf07fc288d 100644
--- a/paddle/fluid/framework/version.h
+++ b/paddle/fluid/framework/version.h
@@ -12,19 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <cstdint>
+
 #pragma once
 
 namespace paddle {
 namespace framework {
 
 // The program version the current codes generate.
-constexpr int kCurProgramVersion = 0;
+constexpr int64_t kCurProgramVersion = 0;
 
 // The program version that was generated by previous or current codes
 // and supported by current codes.
-constexpr int kSupportedProgramVersion[] = {0};
+constexpr int64_t kSupportedProgramVersion[] = {0};
+
+// Due to historical reasons, tensor version use uint32_t.
+constexpr uint32_t kCurTensorVersion = 0;
+
+constexpr uint32_t kSupportedTensorVersion[] = {0};
+
+bool IsProgramVersionSupported(int64_t version);
 
-bool IsProgramVersionSupported(int version);
+bool IsTensorVersionSupported(uint32_t version);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc
index cc57f713d8..e8c5f25600 100644
--- a/paddle/fluid/framework/version_test.cc
+++ b/paddle/fluid/framework/version_test.cc
@@ -17,10 +17,14 @@
 
 namespace paddle {
 namespace framework {
-TEST(Variable, GetMutable) {
+TEST(Version, Basic) {
   EXPECT_TRUE(IsProgramVersionSupported(0));
   EXPECT_FALSE(IsProgramVersionSupported(1));
   EXPECT_FALSE(IsProgramVersionSupported(-1));
+
+  EXPECT_TRUE(IsTensorVersionSupported(0));
+  EXPECT_FALSE(IsTensorVersionSupported(1));
+  EXPECT_FALSE(IsTensorVersionSupported(-1));
 }
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 1d20643ce0..e246a06fd0 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -126,7 +126,8 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
   PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %d is not supported.", main_program->Version());
+                 "model version %ld is not supported.",
+                 main_program->Version());
 
   LoadPersistables(executor, scope, *main_program, dirname, "");
   return main_program;
@@ -142,7 +143,8 @@ std::unique_ptr<framework::ProgramDesc> Load(
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
   PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
-                 "model version %d is not supported.", main_program->Version());
+                 "model version %ld is not supported.",
+                 main_program->Version());
 
   LoadPersistables(executor, scope, *main_program, "", param_filename);
   return main_program;

From ff47eaf45f5d5cc9715aa455467f1af0edfd8872 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 28 Aug 2018 09:34:02 +0800
Subject: [PATCH 11/85] clean

---
 paddle/fluid/framework/program_desc.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index f2e0b79c4b..5b9073373e 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -32,10 +32,6 @@ void ProgramDesc::Flush() {
   for (auto &block : blocks_) {
     block->Flush();
   }
-  // If not loaded, use current code version.
-  if (desc_.version().version() < 0) {
-    desc_.mutable_version()->set_version(kCurProgramVersion);
-  }
 }
 
 proto::ProgramDesc *ProgramDesc::Proto() {

From 9b7c3f9615a3a35d4bde29e4f7154b45e5de7786 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 28 Aug 2018 09:42:34 +0800
Subject: [PATCH 12/85] refine

---
 paddle/fluid/framework/version.h | 8 ++++++++
 python/paddle/fluid/io.py        | 1 -
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h
index bf07fc288d..3a1a492701 100644
--- a/paddle/fluid/framework/version.h
+++ b/paddle/fluid/framework/version.h
@@ -19,6 +19,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+// Note:
+// Program and Tensor that pass the IsXXXVersionSupported should
+// be supported by the current codes. Otherwise, it's a compatibility
+// bug.
+
 // The program version the current codes generate.
 constexpr int64_t kCurProgramVersion = 0;
 
@@ -27,8 +32,11 @@ constexpr int64_t kCurProgramVersion = 0;
 constexpr int64_t kSupportedProgramVersion[] = {0};
 
 // Due to historical reasons, tensor version use uint32_t.
+// The tensor version the current codes generate.
 constexpr uint32_t kCurTensorVersion = 0;
 
+// The tensor version that was generated by previous or current codes
+// and supported by current codes.
 constexpr uint32_t kSupportedTensorVersion[] = {0};
 
 bool IsProgramVersionSupported(int64_t version);
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 3e02e14167..656fafa0cb 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -750,7 +750,6 @@ def load_inference_model(dirname,
         program_desc_str = f.read()
 
     program = Program.parse_from_string(program_desc_str)
-    # TODO(panyx0718): Link to our version and compatibility guide.
     if not core._is_program_version_supported(program._version()):
         raise ValueError("Unsupported program version: %d\n" %
                          program._version())

From 0904f07d4655b82543aba0baca7aecf81a6ff98b Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 28 Aug 2018 11:15:31 +0800
Subject: [PATCH 13/85] polish

---
 paddle/fluid/pybind/pybind.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 6d85d01477..20fc08e21d 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -518,10 +518,6 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_glog", framework::InitGLOG);
   m.def("init_devices",
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
-  m.def("_supported_version", []() {
-    std::vector<int> supported_versions;
-    return supported_versions;
-  });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);

From e762d85de41ebc8d60a31b79c6a21c23a5afa0d5 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Mon, 10 Sep 2018 11:36:42 +0800
Subject: [PATCH 14/85] clean

---
 paddle/fluid/framework/program_desc.cc | 2 +-
 paddle/fluid/framework/program_desc.h  | 2 +-
 paddle/fluid/framework/version.cc      | 2 +-
 paddle/fluid/framework/version.h       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 5b9073373e..589905828f 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -39,7 +39,7 @@ proto::ProgramDesc *ProgramDesc::Proto() {
   return &desc_;
 }
 
-int ProgramDesc::Version() const { return desc_.version().version(); }
+int64_t ProgramDesc::Version() const { return desc_.version().version(); }
 
 ProgramDesc::ProgramDesc() {
   desc_.mutable_version()->set_version(kCurProgramVersion);
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index 9cf3714b6a..2ec0e9d7a0 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -57,7 +57,7 @@ class ProgramDesc {
 
   proto::ProgramDesc *Proto();
 
-  int Version() const;
+  int64_t Version() const;
 
   // The output variable of feed_op is referenced as feed_target.
   // This function is used to collect the output variable's name of all
diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc
index 3d559e26e0..81c0392bf3 100644
--- a/paddle/fluid/framework/version.cc
+++ b/paddle/fluid/framework/version.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h
index 3a1a492701..9945bc58c6 100644
--- a/paddle/fluid/framework/version.h
+++ b/paddle/fluid/framework/version.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

From 681514e15ffbba78def454402f24d5a56f66546c Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Mon, 10 Sep 2018 12:20:08 +0800
Subject: [PATCH 15/85] Make all Scope pointers shared_ptr

---
 .../fast_threaded_ssa_graph_executor.cc       |  3 +-
 .../fast_threaded_ssa_graph_executor.h        | 11 ++++---
 .../framework/details/fetch_op_handle.cc      |  2 +-
 .../fluid/framework/details/fetch_op_handle.h |  4 +--
 .../scope_buffered_ssa_graph_executor.cc      |  3 +-
 .../scope_buffered_ssa_graph_executor.h       |  5 +--
 .../details/threaded_ssa_graph_executor.cc    |  3 +-
 .../details/threaded_ssa_graph_executor.h     | 11 ++++---
 paddle/fluid/framework/parallel_executor.cc   | 31 ++++++++++++-------
 paddle/fluid/framework/parallel_executor.h    | 21 +++++++------
 paddle/fluid/framework/scope.cc               | 11 ++++---
 paddle/fluid/framework/scope.h                |  2 +-
 12 files changed, 63 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 7606f2bc06..a9b89614ae 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -22,7 +22,8 @@ namespace framework {
 namespace details {
 
 FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
-    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
+    const ExecutionStrategy &strategy,
+    const std::vector<std::shared_ptr<Scope>> &local_scopes,
     const std::vector<platform::Place> &places,
     std::unique_ptr<ir::Graph> &&graph)
     : strategy_(strategy),
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index dad3a231cb..fb615d70b7 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -29,16 +29,17 @@ namespace details {
 class OpHandleBase;
 class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
-  FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
-                               const std::vector<Scope *> &local_scopes,
-                               const std::vector<platform::Place> &places,
-                               std::unique_ptr<ir::Graph> &&graph);
+  FastThreadedSSAGraphExecutor(
+      const ExecutionStrategy &strategy,
+      const std::vector<std::shared_ptr<Scope>> &local_scopes,
+      const std::vector<platform::Place> &places,
+      std::unique_ptr<ir::Graph> &&graph);
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
   const ir::Graph &Graph() const override;
 
  private:
   ExecutionStrategy strategy_;
-  std::vector<Scope *> local_scopes_;
+  std::vector<std::shared_ptr<Scope>> local_scopes_;
   std::vector<platform::Place> places_;
   std::unique_ptr<ir::Graph> graph_;
 
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index fe18b2060c..2f4aefd39d 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -22,7 +22,7 @@ namespace framework {
 namespace details {
 
 FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
-                             std::vector<Scope *> *local_scopes)
+                             std::vector<std::shared_ptr<Scope>> *local_scopes)
     : OpHandleBase(node),
       data_(data),
       offset_(offset),
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index 6ce42f92d7..a207e36b8a 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -29,7 +29,7 @@ namespace details {
 struct FetchOpHandle : public OpHandleBase {
  public:
   FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
-                std::vector<Scope *> *local_scopes);
+                std::vector<std::shared_ptr<Scope>> *local_scopes);
 
   ~FetchOpHandle();
 
@@ -47,7 +47,7 @@ struct FetchOpHandle : public OpHandleBase {
  private:
   FeedFetchList *data_;
   size_t offset_;
-  std::vector<Scope *> *local_scopes_;
+  std::vector<std::shared_ptr<Scope>> *local_scopes_;
   std::vector<LoDTensor> tensors_;
 };
 
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 5bd974d6b7..bf5671c679 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -23,7 +23,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
-    ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
+    ExecutionStrategy strategy,
+    std::vector<std::shared_ptr<Scope>> local_scopes,
     std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
     std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
     : strategy_(std::move(strategy)),
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index 5e87e0bf50..ec31755af5 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -37,7 +37,8 @@ struct VariableInfo {
 class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
  public:
   ScopeBufferedSSAGraphExecutor(
-      ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
+      ExecutionStrategy strategy,
+      std::vector<std::shared_ptr<Scope>> local_scopes,
       std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
       std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
 
@@ -52,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
 
   ExecutionStrategy strategy_;
   std::unique_ptr<SSAGraphExecutor> underlying_executor_;
-  std::vector<Scope*> local_scopes_;
+  std::vector<std::shared_ptr<Scope>> local_scopes_;
   std::vector<VariableInfo> var_infos_;
   std::vector<platform::Place> places_;
 };
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index c9e331ef35..cc6f444363 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -21,7 +21,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
-    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
+    const ExecutionStrategy &strategy,
+    const std::vector<std::shared_ptr<Scope>> &local_scopes,
     const std::vector<platform::Place> &places,
     std::unique_ptr<ir::Graph> &&graph)
     : graph_(std::move(graph)),
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 9135c1f5d4..2a74af6c3d 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -38,10 +38,11 @@ namespace details {
 
 class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
-  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
-                           const std::vector<Scope *> &local_scopes,
-                           const std::vector<platform::Place> &places,
-                           std::unique_ptr<ir::Graph> &&graph);
+  ThreadedSSAGraphExecutor(
+      const ExecutionStrategy &strategy,
+      const std::vector<std::shared_ptr<Scope>> &local_scopes,
+      const std::vector<platform::Place> &places,
+      std::unique_ptr<ir::Graph> &&graph);
 
   const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
@@ -57,7 +58,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  private:
   std::unique_ptr<ir::Graph> graph_;
   std::unique_ptr<::ThreadPool> pool_;
-  std::vector<Scope *> local_scopes_;
+  std::vector<std::shared_ptr<Scope>> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
   ExceptionHolder exception_holder_;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 81cb24bdda..93c74deb3e 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -39,7 +39,8 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
     const ProgramDesc &main_program, const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &param_names,
-    const std::vector<Scope *> &local_scopes, const bool use_cuda,
+    const std::vector<std::shared_ptr<Scope>> &local_scopes,
+    const bool use_cuda,
 #ifdef PADDLE_WITH_CUDA
     const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
 #else
@@ -66,8 +67,8 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
                                                      &loss_var_name);
   multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
       "params", &param_names);
-  multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
-                                                              &local_scopes);
+  multi_devices_pass->SetNotOwned<const std::vector<std::shared_ptr<Scope>>>(
+      "local_scopes", &local_scopes);
   multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
 
 #ifdef PADDLE_WITH_CUDA
@@ -100,8 +101,8 @@ class ParallelExecutorPrivate {
       : places_(places) {}
 
   std::vector<platform::Place> places_;
-  std::vector<Scope *> local_scopes_;
-  Scope *global_scope_;
+  std::vector<std::shared_ptr<Scope>> local_scopes_;
+  std::shared_ptr<Scope> global_scope_;
   std::unique_ptr<details::SSAGraphExecutor> executor_;
 
 #ifdef PADDLE_WITH_CUDA
@@ -112,7 +113,7 @@ class ParallelExecutorPrivate {
   bool use_all_reduce_;
 };
 
-std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
+std::vector<std::shared_ptr<Scope>> &ParallelExecutor::GetLocalScopes() {
   return member_->local_scopes_;
 }
 
@@ -121,7 +122,8 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &params,
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
-    Scope *scope, const std::vector<Scope *> &local_scopes,
+    const std::shared_ptr<Scope> &scope,
+    const std::vector<std::shared_ptr<Scope>> &local_scopes,
     const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
     size_t num_trainers, size_t trainer_id)
     : member_(new ParallelExecutorPrivate(places)) {
@@ -142,13 +144,13 @@ ParallelExecutor::ParallelExecutor(
     member_->own_local_scope_ = true;
     member_->local_scopes_.emplace_back(member_->global_scope_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.emplace_back(&scope->NewScope());
+      member_->local_scopes_.emplace_back(scope->NewSharedScope());
     }
   } else {
     member_->own_local_scope_ = false;
     PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
     for (size_t i = 0; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope());
+      member_->local_scopes_.emplace_back(local_scopes[i]->NewSharedScope());
     }
   }
 
@@ -321,7 +323,7 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes(
 
   for (size_t i = 0; i < tensors.size(); ++i) {
     auto &map = tensors[i];
-    auto *scope = member_->local_scopes_[i];
+    auto &scope = member_->local_scopes_[i];
     for (auto &pair : map) {
       auto *trg = scope->Var(pair.first)->GetMutable<LoDTensor>();
       trg->ShareDataWith(pair.second);
@@ -351,8 +353,15 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 
 ParallelExecutor::~ParallelExecutor() {
   if (member_->own_local_scope_) {
+    std::vector<Scope *> local_scopes_ptrs;
+    local_scopes_ptrs.reserve(member_->local_scopes_.size());
     for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
-      member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
+      local_scopes_ptrs.emplace_back(member_->local_scopes_[i].get());
+      member_->local_scopes_[i].reset();
+    }
+
+    for (size_t i = 0; i != local_scopes_ptrs.size(); ++i) {
+      member_->global_scope_->DeleteScope(local_scopes_ptrs[i]);
     }
   }
 }
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 5fb748fa20..ce1076e44b 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -39,19 +39,20 @@ class ParallelExecutor {
   DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
 
  public:
-  explicit ParallelExecutor(const std::vector<platform::Place> &places,
-                            const std::unordered_set<std::string> &params,
-                            const std::unordered_set<std::string> &bcast_vars,
-                            const ProgramDesc &main_program,
-                            const std::string &loss_var_name, Scope *scope,
-                            const std::vector<Scope *> &local_scopes,
-                            const ExecutionStrategy &exec_strategy,
-                            const BuildStrategy &build_strategy,
-                            size_t num_trainers = 1, size_t trainer_id = 0);
+  explicit ParallelExecutor(
+      const std::vector<platform::Place> &places,
+      const std::unordered_set<std::string> &params,
+      const std::unordered_set<std::string> &bcast_vars,
+      const ProgramDesc &main_program, const std::string &loss_var_name,
+      const std::shared_ptr<Scope> &scope,
+      const std::vector<std::shared_ptr<Scope>> &local_scopes,
+      const ExecutionStrategy &exec_strategy,
+      const BuildStrategy &build_strategy, size_t num_trainers = 1,
+      size_t trainer_id = 0);
 
   ~ParallelExecutor();
 
-  std::vector<Scope *> &GetLocalScopes();
+  std::vector<std::shared_ptr<Scope>> &GetLocalScopes();
 
   /**
    * Feed tensors to local scopes. The size of tensors should be equal to the
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 50f374e370..fa6bf4429d 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -38,8 +38,8 @@ Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
   std::unique_lock<std::mutex> lock(mutex_);
-  kids_.push_back(new Scope(this));
-  return *kids_.back();
+  kids_.push_back(std::shared_ptr<Scope>(new Scope(this)));
+  return kids_.back().get();
 }
 
 Variable* Scope::Var(const std::string& name) {
@@ -68,7 +68,6 @@ const Scope* Scope::FindScope(const Variable* var) const {
 
 void Scope::DropKids() {
   std::unique_lock<std::mutex> lock(mutex_);
-  for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
@@ -84,8 +83,12 @@ std::vector<std::string> Scope::LocalVarNames() const {
 
 void Scope::DeleteScope(Scope* scope) const {
   std::unique_lock<std::mutex> lock(mutex_);
-  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
+  auto it = std::find_if(this->kids_.begin(), this->kids_.end(),
+                         [&scope](const std::shared_ptr<Scope>& kid) {
+                           return kid.get() == scope;
+                         });
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
+  it->reset();
   this->kids_.erase(it);
   // When making memory benchmark on Fluid, we have to delete scope sync.
   if (FLAGS_benchmark || FLAGS_eager_delete_scope) {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index e246241c0a..0ba5d34798 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -105,7 +105,7 @@ class Scope {
   Variable* FindVarLocally(const std::string& name) const;
 
   // Scope in `kids_` are owned by this class.
-  mutable std::list<Scope*> kids_;
+  mutable std::list<std::shared_ptr<Scope>> kids_;
   Scope const* parent_{nullptr};
 
   DISABLE_COPY_AND_ASSIGN(Scope);

From dc863aac7edeccbe8362d625b2c1e6eeca885000 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Mon, 10 Sep 2018 14:29:19 +0800
Subject: [PATCH 16/85] Add kid existence detection in Scope

---
 .../fast_threaded_ssa_graph_executor.cc       |  3 +-
 .../fast_threaded_ssa_graph_executor.h        | 11 +++---
 .../framework/details/fetch_op_handle.cc      |  2 +-
 .../fluid/framework/details/fetch_op_handle.h |  4 +--
 .../scope_buffered_ssa_graph_executor.cc      |  3 +-
 .../scope_buffered_ssa_graph_executor.h       |  5 ++-
 .../details/threaded_ssa_graph_executor.cc    |  3 +-
 .../details/threaded_ssa_graph_executor.h     | 11 +++---
 paddle/fluid/framework/parallel_executor.cc   | 34 ++++++++-----------
 paddle/fluid/framework/parallel_executor.h    | 21 ++++++------
 paddle/fluid/framework/scope.cc               | 17 ++++++----
 paddle/fluid/framework/scope.h                |  5 ++-
 .../test_image_classification_resnet.py       |  5 +--
 .../test_image_classification_vgg.py          |  5 +--
 .../test_recognize_digits_conv.py             |  5 +--
 .../test_recognize_digits_mlp.py              |  5 +--
 16 files changed, 60 insertions(+), 79 deletions(-)

diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index a9b89614ae..7606f2bc06 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -22,8 +22,7 @@ namespace framework {
 namespace details {
 
 FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
-    const ExecutionStrategy &strategy,
-    const std::vector<std::shared_ptr<Scope>> &local_scopes,
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
     std::unique_ptr<ir::Graph> &&graph)
     : strategy_(strategy),
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index fb615d70b7..dad3a231cb 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -29,17 +29,16 @@ namespace details {
 class OpHandleBase;
 class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
-  FastThreadedSSAGraphExecutor(
-      const ExecutionStrategy &strategy,
-      const std::vector<std::shared_ptr<Scope>> &local_scopes,
-      const std::vector<platform::Place> &places,
-      std::unique_ptr<ir::Graph> &&graph);
+  FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
+                               const std::vector<Scope *> &local_scopes,
+                               const std::vector<platform::Place> &places,
+                               std::unique_ptr<ir::Graph> &&graph);
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
   const ir::Graph &Graph() const override;
 
  private:
   ExecutionStrategy strategy_;
-  std::vector<std::shared_ptr<Scope>> local_scopes_;
+  std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   std::unique_ptr<ir::Graph> graph_;
 
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 2f4aefd39d..fe18b2060c 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -22,7 +22,7 @@ namespace framework {
 namespace details {
 
 FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
-                             std::vector<std::shared_ptr<Scope>> *local_scopes)
+                             std::vector<Scope *> *local_scopes)
     : OpHandleBase(node),
       data_(data),
       offset_(offset),
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index a207e36b8a..6ce42f92d7 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -29,7 +29,7 @@ namespace details {
 struct FetchOpHandle : public OpHandleBase {
  public:
   FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
-                std::vector<std::shared_ptr<Scope>> *local_scopes);
+                std::vector<Scope *> *local_scopes);
 
   ~FetchOpHandle();
 
@@ -47,7 +47,7 @@ struct FetchOpHandle : public OpHandleBase {
  private:
   FeedFetchList *data_;
   size_t offset_;
-  std::vector<std::shared_ptr<Scope>> *local_scopes_;
+  std::vector<Scope *> *local_scopes_;
   std::vector<LoDTensor> tensors_;
 };
 
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index bf5671c679..5bd974d6b7 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -23,8 +23,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
-    ExecutionStrategy strategy,
-    std::vector<std::shared_ptr<Scope>> local_scopes,
+    ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
     std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
     std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
     : strategy_(std::move(strategy)),
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index ec31755af5..5e87e0bf50 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -37,8 +37,7 @@ struct VariableInfo {
 class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
  public:
   ScopeBufferedSSAGraphExecutor(
-      ExecutionStrategy strategy,
-      std::vector<std::shared_ptr<Scope>> local_scopes,
+      ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
       std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
       std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
 
@@ -53,7 +52,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
 
   ExecutionStrategy strategy_;
   std::unique_ptr<SSAGraphExecutor> underlying_executor_;
-  std::vector<std::shared_ptr<Scope>> local_scopes_;
+  std::vector<Scope*> local_scopes_;
   std::vector<VariableInfo> var_infos_;
   std::vector<platform::Place> places_;
 };
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index cc6f444363..c9e331ef35 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -21,8 +21,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
-    const ExecutionStrategy &strategy,
-    const std::vector<std::shared_ptr<Scope>> &local_scopes,
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
     std::unique_ptr<ir::Graph> &&graph)
     : graph_(std::move(graph)),
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 2a74af6c3d..9135c1f5d4 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -38,11 +38,10 @@ namespace details {
 
 class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
-  ThreadedSSAGraphExecutor(
-      const ExecutionStrategy &strategy,
-      const std::vector<std::shared_ptr<Scope>> &local_scopes,
-      const std::vector<platform::Place> &places,
-      std::unique_ptr<ir::Graph> &&graph);
+  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
+                           const std::vector<Scope *> &local_scopes,
+                           const std::vector<platform::Place> &places,
+                           std::unique_ptr<ir::Graph> &&graph);
 
   const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
@@ -58,7 +57,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  private:
   std::unique_ptr<ir::Graph> graph_;
   std::unique_ptr<::ThreadPool> pool_;
-  std::vector<std::shared_ptr<Scope>> local_scopes_;
+  std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
   ExceptionHolder exception_holder_;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 93c74deb3e..5b8c75a93d 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -39,8 +39,7 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
     const ProgramDesc &main_program, const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &param_names,
-    const std::vector<std::shared_ptr<Scope>> &local_scopes,
-    const bool use_cuda,
+    const std::vector<Scope *> &local_scopes, const bool use_cuda,
 #ifdef PADDLE_WITH_CUDA
     const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
 #else
@@ -67,8 +66,8 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
                                                      &loss_var_name);
   multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
       "params", &param_names);
-  multi_devices_pass->SetNotOwned<const std::vector<std::shared_ptr<Scope>>>(
-      "local_scopes", &local_scopes);
+  multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+                                                              &local_scopes);
   multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
 
 #ifdef PADDLE_WITH_CUDA
@@ -101,8 +100,8 @@ class ParallelExecutorPrivate {
       : places_(places) {}
 
   std::vector<platform::Place> places_;
-  std::vector<std::shared_ptr<Scope>> local_scopes_;
-  std::shared_ptr<Scope> global_scope_;
+  std::vector<Scope *> local_scopes_;
+  Scope *global_scope_;
   std::unique_ptr<details::SSAGraphExecutor> executor_;
 
 #ifdef PADDLE_WITH_CUDA
@@ -113,7 +112,7 @@ class ParallelExecutorPrivate {
   bool use_all_reduce_;
 };
 
-std::vector<std::shared_ptr<Scope>> &ParallelExecutor::GetLocalScopes() {
+std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
   return member_->local_scopes_;
 }
 
@@ -122,8 +121,7 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &params,
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
-    const std::shared_ptr<Scope> &scope,
-    const std::vector<std::shared_ptr<Scope>> &local_scopes,
+    Scope *scope, const std::vector<Scope *> &local_scopes,
     const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
     size_t num_trainers, size_t trainer_id)
     : member_(new ParallelExecutorPrivate(places)) {
@@ -144,13 +142,13 @@ ParallelExecutor::ParallelExecutor(
     member_->own_local_scope_ = true;
     member_->local_scopes_.emplace_back(member_->global_scope_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.emplace_back(scope->NewSharedScope());
+      member_->local_scopes_.emplace_back(&scope->NewScope());
     }
   } else {
     member_->own_local_scope_ = false;
     PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
     for (size_t i = 0; i < member_->places_.size(); ++i) {
-      member_->local_scopes_.emplace_back(local_scopes[i]->NewSharedScope());
+      member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope());
     }
   }
 
@@ -323,7 +321,7 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes(
 
   for (size_t i = 0; i < tensors.size(); ++i) {
     auto &map = tensors[i];
-    auto &scope = member_->local_scopes_[i];
+    auto *scope = member_->local_scopes_[i];
     for (auto &pair : map) {
       auto *trg = scope->Var(pair.first)->GetMutable<LoDTensor>();
       trg->ShareDataWith(pair.second);
@@ -353,15 +351,11 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 
 ParallelExecutor::~ParallelExecutor() {
   if (member_->own_local_scope_) {
-    std::vector<Scope *> local_scopes_ptrs;
-    local_scopes_ptrs.reserve(member_->local_scopes_.size());
     for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
-      local_scopes_ptrs.emplace_back(member_->local_scopes_[i].get());
-      member_->local_scopes_[i].reset();
-    }
-
-    for (size_t i = 0; i != local_scopes_ptrs.size(); ++i) {
-      member_->global_scope_->DeleteScope(local_scopes_ptrs[i]);
+      Scope *local_scope = member_->local_scopes_[i];
+      if (member_->global_scope_->HasKid(local_scope)) {
+        member_->global_scope_->DeleteScope(local_scope);
+      }
     }
   }
 }
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index ce1076e44b..5fb748fa20 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -39,20 +39,19 @@ class ParallelExecutor {
   DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
 
  public:
-  explicit ParallelExecutor(
-      const std::vector<platform::Place> &places,
-      const std::unordered_set<std::string> &params,
-      const std::unordered_set<std::string> &bcast_vars,
-      const ProgramDesc &main_program, const std::string &loss_var_name,
-      const std::shared_ptr<Scope> &scope,
-      const std::vector<std::shared_ptr<Scope>> &local_scopes,
-      const ExecutionStrategy &exec_strategy,
-      const BuildStrategy &build_strategy, size_t num_trainers = 1,
-      size_t trainer_id = 0);
+  explicit ParallelExecutor(const std::vector<platform::Place> &places,
+                            const std::unordered_set<std::string> &params,
+                            const std::unordered_set<std::string> &bcast_vars,
+                            const ProgramDesc &main_program,
+                            const std::string &loss_var_name, Scope *scope,
+                            const std::vector<Scope *> &local_scopes,
+                            const ExecutionStrategy &exec_strategy,
+                            const BuildStrategy &build_strategy,
+                            size_t num_trainers = 1, size_t trainer_id = 0);
 
   ~ParallelExecutor();
 
-  std::vector<std::shared_ptr<Scope>> &GetLocalScopes();
+  std::vector<Scope *> &GetLocalScopes();
 
   /**
    * Feed tensors to local scopes. The size of tensors should be equal to the
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index fa6bf4429d..2be655b89a 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -38,8 +38,8 @@ Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
   std::unique_lock<std::mutex> lock(mutex_);
-  kids_.push_back(std::shared_ptr<Scope>(new Scope(this)));
-  return kids_.back().get();
+  kids_.push_back(new Scope(this));
+  return *kids_.back();
 }
 
 Variable* Scope::Var(const std::string& name) {
@@ -68,9 +68,16 @@ const Scope* Scope::FindScope(const Variable* var) const {
 
 void Scope::DropKids() {
   std::unique_lock<std::mutex> lock(mutex_);
+  for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
+bool Scope::HasKid(const Scope* scope) const {
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
+  return it != this->kids_.end();
+}
+
 std::vector<std::string> Scope::LocalVarNames() const {
   std::unique_lock<std::mutex> lock(mutex_);
   std::vector<std::string> known_vars;
@@ -83,12 +90,8 @@ std::vector<std::string> Scope::LocalVarNames() const {
 
 void Scope::DeleteScope(Scope* scope) const {
   std::unique_lock<std::mutex> lock(mutex_);
-  auto it = std::find_if(this->kids_.begin(), this->kids_.end(),
-                         [&scope](const std::shared_ptr<Scope>& kid) {
-                           return kid.get() == scope;
-                         });
+  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
-  it->reset();
   this->kids_.erase(it);
   // When making memory benchmark on Fluid, we have to delete scope sync.
   if (FLAGS_benchmark || FLAGS_eager_delete_scope) {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 0ba5d34798..b6165a595d 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -71,6 +71,9 @@ class Scope {
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
 
+  /// Find if a scope exists in the kid scopes
+  bool HasKid(const Scope* scope) const;
+
   // enumerate all the variables current contains.
   std::vector<std::string> LocalVarNames() const;
 
@@ -105,7 +108,7 @@ class Scope {
   Variable* FindVarLocally(const std::string& name) const;
 
   // Scope in `kids_` are owned by this class.
-  mutable std::list<std::shared_ptr<Scope>> kids_;
+  mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
 
   DISABLE_COPY_AND_ASSIGN(Scope);
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index e5ae95e2d9..de276755bb 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -178,7 +178,4 @@ if __name__ == '__main__':
         for parallel in (False, True):
             if use_cuda and not core.is_compiled_with_cuda():
                 continue
-            # TODO(minqiyang): remove this line after fixing the deletion
-            # order problem of Scope in ParallelExecutor in manylinux
-            if six.PY2:
-                main(use_cuda=use_cuda, parallel=parallel)
+            main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index ff91be72c9..dd547f3448 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -152,7 +152,4 @@ if __name__ == '__main__':
         for parallel in (False, True):
             if use_cuda and not core.is_compiled_with_cuda():
                 continue
-            # TODO(minqiyang): remove this line after fixing the deletion
-            # order problem of Scope in ParallelExecutor in manylinux
-            if six.PY2:
-                main(use_cuda=use_cuda, parallel=parallel)
+            main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
index fa72c939e5..973308498b 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -155,7 +155,4 @@ if __name__ == '__main__':
         for parallel in (False, True):
             if use_cuda and not core.is_compiled_with_cuda():
                 continue
-            # TODO(minqiyang): remove this line after fixing the deletion
-            # order problem of Scope in ParallelExecutor in manylinux
-            if six.PY2:
-                main(use_cuda=use_cuda, parallel=parallel)
+            main(use_cuda=use_cuda, parallel=parallel)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
index 440d2a3083..cb4aeb430e 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -137,7 +137,4 @@ if __name__ == '__main__':
         for parallel in (False, True):
             if use_cuda and not core.is_compiled_with_cuda():
                 continue
-            # TODO(minqiyang): remove this line after fixing the deletion
-            # order problem of Scope in ParallelExecutor in manylinux
-            if six.PY2:
-                main(use_cuda=use_cuda, parallel=parallel)
+            main(use_cuda=use_cuda, parallel=parallel)

From 32b94a7d13233aba6f077dac43071e54f43fd489 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Mon, 10 Sep 2018 15:09:47 +0800
Subject: [PATCH 17/85] cache var types

---
 paddle/fluid/operators/listen_and_serv_op.cc | 56 +++++++++++++++-----
 paddle/fluid/operators/listen_and_serv_op.h  | 11 ++--
 2 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index abbb3d06d1..966d78b841 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -104,8 +104,7 @@ void ListenAndServOp::RunSyncLoop(
     framework::Executor *executor, framework::ProgramDesc *program,
     framework::Scope *recv_scope, platform::DeviceContext *dev_ctx,
     const std::vector<int> &prefetch_block_id_list,
-    const int checkpoint_point_block_id,
-    const std::vector<std::string> &recv_varnames) const {
+    const int checkpoint_point_block_id) const {
   VLOG(2) << "RunSyncLoop";
   size_t num_blocks = program->Size();
   auto optimize_blocks =
@@ -130,6 +129,7 @@ void ListenAndServOp::RunSyncLoop(
   rpc_service_->SetCond(distributed::kRequestGet);
   rpc_service_->WaitBarrier(distributed::kRequestGet);
   rpc_service_->ResetBarrierCounter();
+
   while (true) {
     rpc_service_->Profiler().OneStep();
     // Get from multiple trainers, we don't care about the order in which
@@ -167,8 +167,7 @@ void ListenAndServOp::RunSyncLoop(
                           recv_scope);
     VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
 
-    ResetReceivedVars(recv_varnames, recv_scope, dev_ctx,
-                      rpc_service_->NeedResetAllVars());
+    ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars());
 
     rpc_service_->SetCond(distributed::kRequestGet);
     rpc_service_->WaitBarrier(distributed::kRequestGet);
@@ -176,10 +175,10 @@ void ListenAndServOp::RunSyncLoop(
   }  // while(true)
 }
 
-void ListenAndServOp::ResetReceivedVars(
-    const std::vector<std::string> &recv_varnames, framework::Scope *recv_scope,
-    platform::DeviceContext *dev_ctx, bool reset_all) const {
-  for (auto &varname : recv_varnames) {
+void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
+                                        platform::DeviceContext *dev_ctx,
+                                        bool reset_all) const {
+  for (auto &varname : sparse_vars_) {
     auto var = recv_scope->FindVar(varname);
     if (var == nullptr) {
       VLOG(2) << "can not find var " << varname << " in received scope";
@@ -188,9 +187,17 @@ void ListenAndServOp::ResetReceivedVars(
     if (var->IsType<framework::SelectedRows>()) {
       VLOG(3) << "reset sparse var: " << varname;
       var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
+    } else {
+      PADDLE_THROW("The type of sparse var should be SelectedRows");
     }
-    if (UNLIKELY(reset_all)) {
-      VLOG(3) << "reset dense var: " << varname;
+  }
+  if (UNLIKELY(reset_all)) {
+    for (auto &varname : dense_vars_) {
+      auto var = recv_scope->FindVar(varname);
+      if (var == nullptr) {
+        VLOG(2) << "can not find var " << varname << " in received scope";
+        continue;
+      }
       if (var->IsType<framework::LoDTensor>()) {
         math::set_constant(*dev_ctx, var->GetMutable<framework::LoDTensor>(),
                            static_cast<float>(0));
@@ -198,8 +205,7 @@ void ListenAndServOp::ResetReceivedVars(
         math::set_constant(*dev_ctx, var->GetMutable<framework::Tensor>(),
                            static_cast<float>(0));
       } else {
-        PADDLE_THROW(
-            "received var should be in [SelectedRows, LoDTensor, Tensor]");
+        PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]");
       }
     }
   }
@@ -278,6 +284,25 @@ static void FillRequestCtx(
   h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx);
 }
 
+void ListenAndServOp::CacheVarsType(const std::vector<std::string> &varnames,
+                                    const framework::Scope &scope) const {
+  for (const auto &varname : varnames) {
+    auto var = scope.FindVar(varname);
+    PADDLE_ENFORCE(var != nullptr,
+                   "Received var should be initialized in the received scope.");
+    if (var->IsType<framework::SelectedRows>()) {
+      sparse_vars_.push_back(varname);
+    } else if (var->IsType<framework::LoDTensor>() ||
+               var->IsType<framework::Tensor>()) {
+      dense_vars_.push_back(varname);
+    } else {
+      PADDLE_THROW(
+          "The type of received var should be in [SelectedRows, LoDTensor, "
+          "Tensor].");
+    }
+  }
+}
+
 void ListenAndServOp::RunImpl(const framework::Scope &scope,
                               const platform::Place &dev_place) const {
   // Mark this as PS that it should decide profiling by listening from trainer.
@@ -379,11 +404,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   signal(SIGINT, SignalHandler::StopAndExit);
   signal(SIGTERM, SignalHandler::StopAndExit);
 
+  // Cache the type of the received vars as `sparse_vars_` and `dense_vars_`
+  // so that we can reset them at the end of each iteration.
+  // NOTE: only used in sync update
+  CacheVarsType(inputs, recv_scope);
+
   // Write to a file of server selected port for python use.
   SavePort();
   if (sync_mode) {
     RunSyncLoop(&executor, program, &recv_scope, &dev_ctx,
-                prefetch_block_id_list, checkpoint_block_id, inputs);
+                prefetch_block_id_list, checkpoint_block_id);
   } else {
     RunAsyncLoop(&executor, program, &recv_scope);
   }
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
index 5102c963b9..5f889793ab 100644
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -51,8 +51,7 @@ class ListenAndServOp : public framework::OperatorBase {
                    framework::Scope* recv_scope,
                    platform::DeviceContext* dev_ctx,
                    const std::vector<int>& prefetch_block_id_list,
-                   const int checkpoint_point_block_id,
-                   const std::vector<std::string>& recv_varnames) const;
+                   const int checkpoint_point_block_id) const;
 
   void RunAsyncLoop(framework::Executor* executor,
                     framework::ProgramDesc* program,
@@ -67,11 +66,13 @@ class ListenAndServOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override;
 
-  void ResetReceivedVars(const std::vector<std::string>& recv_varnames,
-                         framework::Scope* recv_scope,
+  void ResetReceivedVars(framework::Scope* recv_scope,
                          platform::DeviceContext* dev_ctx,
                          bool reset_all = false) const;
 
+  void CacheVarsType(const std::vector<std::string>& varnames,
+                     const framework::Scope& scope) const;
+
  protected:
   mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
   mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
@@ -82,6 +83,8 @@ class ListenAndServOp : public framework::OperatorBase {
       request_checkpoint_handler_;
 
   mutable std::shared_ptr<std::thread> server_thread_;
+  mutable std::vector<std::string> sparse_vars_;
+  mutable std::vector<std::string> dense_vars_;
 };
 
 class SignalHandler {

From 2fd1bf2ea6b6e0d27fb49461dd2b35c8e2a2b13b Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Mon, 10 Sep 2018 17:31:06 +0800
Subject: [PATCH 18/85] fea/add color log (#13305)

---
 paddle/fluid/framework/ir/CMakeLists.txt      |  2 +-
 .../framework/ir/graph_pattern_detector.cc    |  7 +-
 .../fluid/inference/analysis/CMakeLists.txt   |  2 +-
 .../inference/analysis/ir_pass_manager.cc     |  6 +-
 .../fluid/inference/analysis/pass_manager.cc  |  8 ++-
 paddle/fluid/string/CMakeLists.txt            |  2 +
 paddle/fluid/string/pretty_log.cc             | 22 ++++++
 paddle/fluid/string/pretty_log.h              | 70 +++++++++++++++++++
 8 files changed, 112 insertions(+), 7 deletions(-)
 create mode 100644 paddle/fluid/string/pretty_log.cc
 create mode 100644 paddle/fluid/string/pretty_log.h

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 78387c4073..ce3ebed00b 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -19,7 +19,7 @@ function(pass_library TARGET DEST)
 endfunction()
 
 cc_library(node SRCS node.cc DEPS proto_desc)
-cc_library(graph SRCS graph.cc DEPS node)
+cc_library(graph SRCS graph.cc DEPS node pretty_log)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index fc7feca567..5825a129b7 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -21,12 +21,17 @@
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/pretty_log.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
+using string::PrettyLogEndl;
+using string::PrettyLog;
+using string::Style;
+
 size_t PDPattern::id_ = 0UL;
 
 PDNode* PDPattern::NewNode(const std::string& name) {
@@ -83,7 +88,7 @@ void GraphPatternDetector::operator()(Graph* graph,
   ValidateByNodeRole(&subgraphs);
 
   if (subgraphs.empty()) return;
-  LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
+  PrettyLogEndl(Style::detail(), "---  detect %d subgraphs", subgraphs.size());
   int id = 0;
   for (auto& g : subgraphs) {
     VLOG(3) << "optimizing #" << id++ << " subgraph";
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 11a7509feb..fecce9c225 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -1,6 +1,6 @@
 cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
 set(analysis_deps
-    framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor)
+    framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
 
 cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
   analyzer.cc
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index ea0f2241d7..30c1e8e93d 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -17,10 +17,14 @@
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
+using string::PrettyLogEndl;
+using string::PrettyLog;
+using string::Style;
 
 IRPassManager::IRPassManager(const ProgramDesc &program,
                              framework::Scope *scope)
@@ -34,7 +38,7 @@ void IRPassManager::Apply(const std::vector<std::string> &passes) {
   // Apply all the passes
   std::string pre_pass;
   for (const std::string &pass_name : passes) {
-    LOG(WARNING) << "Running IR pass [" << pass_name << "]";
+    PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name);
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
     if (pass_name == "graph_viz_pass") {
       std::string dot_file_path =
diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc
index 759b2b96a1..a6ac0ee49f 100644
--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
 namespace inference {
@@ -22,7 +23,7 @@ namespace analysis {
 bool PassManager::Initialize(Argument* argument) {
   argument_ = argument;
   for (auto& pass : data_) {
-    LOG(WARNING) << "Initializing pass [" << pass->repr() << "]";
+    VLOG(3) << "Initializing pass [" << pass->repr() << "]";
     if (!pass->Initialize(argument)) {
       LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
       return false;
@@ -33,9 +34,10 @@ bool PassManager::Initialize(Argument* argument) {
 
 void DfgPassManager::RunAll() {
   PADDLE_ENFORCE(argument_);
-  LOG(INFO) << "Total " << data_.size() << " Analysys passes";
+  VLOG(3) << "Total " << data_.size() << " Analysys passes";
   for (auto& pass : data_) {
-    LOG(WARNING) << "Running Analysis pass [" << pass->repr() << "]";
+    string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]",
+                          pass->repr());
     pass->Run(argument_->main_dfg.get());
   }
 }
diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt
index 1fe7f42ca1..719411bf66 100644
--- a/paddle/fluid/string/CMakeLists.txt
+++ b/paddle/fluid/string/CMakeLists.txt
@@ -1,4 +1,6 @@
 cc_library(stringpiece SRCS piece.cc)
+cc_library(pretty_log SRCS pretty_log.cc)
+cc_test(test_pretty_log SRCS pretty_log.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)
diff --git a/paddle/fluid/string/pretty_log.cc b/paddle/fluid/string/pretty_log.cc
new file mode 100644
index 0000000000..4534fdc58b
--- /dev/null
+++ b/paddle/fluid/string/pretty_log.cc
@@ -0,0 +1,22 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/string/pretty_log.h"
+#include <gflags/gflags.h>
+
+DEFINE_bool(color, true, "Whether to turn on pretty log");
+
+namespace paddle {
+namespace string {}  // namespace string
+}  // namespace paddle
diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h
new file mode 100644
index 0000000000..a3b4e38f45
--- /dev/null
+++ b/paddle/fluid/string/pretty_log.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <gflags/gflags.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <utility>
+#include "paddle/fluid/string/printf.h"
+
+DECLARE_bool(color);
+
+namespace paddle {
+
+namespace string {
+
+inline std::string black() { return FLAGS_color ? "\e[30m" : ""; }
+inline std::string red() { return FLAGS_color ? "\e[31m" : ""; }
+inline std::string b_red() { return FLAGS_color ? "\e[41m" : ""; }
+inline std::string green() { return FLAGS_color ? "\e[32m" : ""; }
+inline std::string yellow() { return FLAGS_color ? "\e[33m" : ""; }
+inline std::string blue() { return FLAGS_color ? "\e[34m" : ""; }
+inline std::string purple() { return FLAGS_color ? "\e[35m" : ""; }
+inline std::string cyan() { return FLAGS_color ? "\e[36m" : ""; }
+inline std::string light_gray() { return FLAGS_color ? "\e[37m" : ""; }
+inline std::string white() { return FLAGS_color ? "\e[37m" : ""; }
+inline std::string light_red() { return FLAGS_color ? "\e[91m" : ""; }
+inline std::string dim() { return FLAGS_color ? "\e[2m" : ""; }
+inline std::string bold() { return FLAGS_color ? "\e[1m" : ""; }
+inline std::string underline() { return FLAGS_color ? "\e[4m" : ""; }
+inline std::string blink() { return FLAGS_color ? "\e[5m" : ""; }
+inline std::string reset() { return FLAGS_color ? "\e[0m" : ""; }
+
+using TextBlock = std::pair<std::string, std::string>;
+
+struct Style {
+  static std::string info() { return black(); }
+  static std::string warn() { return b_red(); }
+  static std::string suc() { return green(); }
+  static std::string H1() { return bold() + purple(); }
+  static std::string H2() { return green(); }
+  static std::string H3() { return green(); }
+  static std::string detail() { return light_gray(); }
+};
+
+template <typename... Args>
+static void PrettyLogEndl(const std::string& style, const char* fmt,
+                          const Args&... args) {
+  std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl;
+}
+template <typename... Args>
+static void PrettyLog(const std::string& style, const char* fmt,
+                      const Args&... args) {
+  std::cerr << style << Sprintf(fmt, args...) << reset();
+}
+
+}  // namespace string
+}  // namespace paddle

From b720b3a58f7e5150c2d1c070a80132b349a612cb Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Mon, 10 Sep 2018 17:52:03 +0800
Subject: [PATCH 19/85] fix fluid benchmark script

---
 benchmark/fluid/fluid_benchmark.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 11bd75e1d0..25622ee06c 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -91,7 +91,8 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog):
         program=train_prog,
         pservers=pserver_endpoints,
         trainers=trainers,
-        sync_mode=not args.async_mode)
+        sync_mode=not args.async_mode,
+        startup_program=startup_prog)
     if training_role == "PSERVER":
         pserver_program = t.get_pserver_program(current_endpoint)
         pserver_startup_program = t.get_startup_program(

From 83af1b3b3e7933c95398365aaec15f1bff0cc7f4 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Mon, 10 Sep 2018 18:52:15 +0800
Subject: [PATCH 20/85] move analyzer_rnn1_test out of analyzer_test

---
 .../fluid/inference/analysis/CMakeLists.txt   |   8 +-
 .../analysis/analyzer_rnn1_tester.cc          | 306 ++++++++++++++++++
 .../inference/analysis/analyzer_tester.cc     | 282 +---------------
 3 files changed, 314 insertions(+), 282 deletions(-)
 create mode 100644 paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 11a7509feb..699e16ad97 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -35,11 +35,15 @@ function (inference_analysis_test TARGET)
         cc_test(${TARGET}
                 SRCS "${analysis_test_SRCS}"
                 DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
+                ARGS ${mem_opt} ${analysis_test_ARGS})
         set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
     endif(WITH_TESTING)
 endfunction(inference_analysis_test)
 
+inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+    ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+
 function (inference_download_and_uncompress install_dir url gz_filename)
     message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
     execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
@@ -56,7 +60,7 @@ if (NOT EXISTS ${RNN1_INSTALL_DIR} AND WITH_TESTING)
   inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz")
 endif()
 
-inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
+inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
     EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
     ARGS --infer_model=${RNN1_INSTALL_DIR}/model
          --infer_data=${RNN1_INSTALL_DIR}/data.txt)
diff --git a/paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc b/paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc
new file mode 100644
index 0000000000..b8ac468b4e
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc
@@ -0,0 +1,306 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+
+#include <google/protobuf/text_format.h>
+#include <gtest/gtest.h>
+#include <thread>  // NOLINT
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data path");
+DEFINE_int32(batch_size, 10, "batch size.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+
+namespace paddle {
+namespace inference {
+
+using namespace framework;  // NOLINT
+
+struct DataRecord {
+  std::vector<std::vector<std::vector<float>>> link_step_data_all;
+  std::vector<std::vector<float>> week_data_all, minute_data_all;
+  std::vector<size_t> lod1, lod2, lod3;
+  std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
+      rnn_minute_datas;
+  size_t batch_iter{0};  // index of the first sample of the next batch
+  size_t batch_size{1};  // number of samples taken by each NextBatch() call
+  DataRecord() = default;
+  explicit DataRecord(const std::string &path, int batch_size = 1)
+      : batch_size(batch_size) {
+    Load(path);
+  }
+  DataRecord NextBatch() {
+    DataRecord data;
+    size_t batch_end = batch_iter + batch_size;
+    // NOTE the batch is left empty if fewer than batch_size samples remain.
+    if (batch_end <= link_step_data_all.size()) {
+      data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
+                                     link_step_data_all.begin() + batch_end);
+      data.week_data_all.assign(week_data_all.begin() + batch_iter,
+                                week_data_all.begin() + batch_end);
+      data.minute_data_all.assign(minute_data_all.begin() + batch_iter,
+                                  minute_data_all.begin() + batch_end);
+      // Prepare LoDs
+      data.lod1.push_back(0);
+      data.lod2.push_back(0);
+      data.lod3.push_back(0);
+      CHECK(!data.link_step_data_all.empty()) << "empty";
+      CHECK(!data.week_data_all.empty());
+      CHECK(!data.minute_data_all.empty());
+      CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size());
+      CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size());
+      for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
+        for (const auto &d : data.link_step_data_all[j]) {
+          data.rnn_link_data.push_back(d);
+        }
+        data.rnn_week_datas.push_back(data.week_data_all[j]);
+        data.rnn_minute_datas.push_back(data.minute_data_all[j]);
+        // lod1 += #steps, lod3 += 1, lod2 += #steps once per step
+        data.lod1.push_back(data.lod1.back() +
+                            data.link_step_data_all[j].size());
+        data.lod3.push_back(data.lod3.back() + 1);
+        for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) {
+          data.lod2.push_back(data.lod2.back() +
+                              data.link_step_data_all[j].size());
+        }
+      }
+    }
+    batch_iter += batch_size;
+    return data;
+  }
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    std::string line;
+    CHECK(file.is_open()) << "failed to open data file: " << path;
+    while (std::getline(file, line)) {
+      std::vector<std::string> data;
+      split(line, ':', &data);
+      CHECK_GE(data.size(), 3UL) << "malformed record: " << line;
+      std::vector<std::vector<float>> link_step_data;
+      std::vector<std::string> link_datas;
+      split(data[0], '|', &link_datas);
+      for (auto &step_data : link_datas) {
+        std::vector<float> tmp;
+        split_to_float(step_data, ',', &tmp);
+        link_step_data.push_back(std::move(tmp));
+      }
+      // week data is the third ':'-separated field
+      std::vector<float> week_data;
+      split_to_float(data[2], ',', &week_data);
+      // minute data is the second ':'-separated field
+      std::vector<float> minute_data;
+      split_to_float(data[1], ',', &minute_data);
+      link_step_data_all.push_back(std::move(link_step_data));
+      week_data_all.push_back(std::move(week_data));
+      minute_data_all.push_back(std::move(minute_data));
+    }
+  }
+};
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                   int batch_size) {  // rows of the zero-filled init tensors
+  PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
+      week_tensor, minute_tensor;
+  lod_attention_tensor.name = "data_lod_attention";
+  init_zero_tensor.name = "cell_init";
+  lod_tensor_tensor.name = "data";
+  week_tensor.name = "week";
+  minute_tensor.name = "minute";
+  auto one_batch = data->NextBatch();  // empty when too few samples remain
+  CHECK(!one_batch.rnn_link_data.empty()) << "not enough data for one batch";
+  lod_attention_tensor.shape.assign({1, 2});
+  lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
+  init_zero_tensor.shape.assign({batch_size, 15});
+  init_zero_tensor.lod.assign({one_batch.lod3});
+  lod_tensor_tensor.lod.assign({one_batch.lod1});
+  // clang-format off
+  lod_tensor_tensor.shape.assign(
+      {static_cast<int>(one_batch.rnn_link_data.size()),
+       static_cast<int>(one_batch.rnn_link_data.front().size())});
+  week_tensor.shape.assign(
+      {static_cast<int>(one_batch.rnn_week_datas.size()),
+       static_cast<int>(one_batch.rnn_week_datas.front().size())});
+  week_tensor.lod.assign({one_batch.lod3});
+  minute_tensor.shape.assign(
+      {static_cast<int>(one_batch.rnn_minute_datas.size()),
+       static_cast<int>(one_batch.rnn_minute_datas.front().size())});
+  minute_tensor.lod.assign({one_batch.lod3});
+  // clang-format on
+  // assign data
+  TensorAssignData<float>(&lod_attention_tensor,
+                          std::vector<std::vector<float>>({{0, 0}}));
+  std::vector<float> tmp_zeros(batch_size * 15, 0.);
+  TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
+  TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
+  TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
+  TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
+  // Set inputs; "hidden_init" reuses the zero tensor built for "cell_init".
+  auto init_zero_tensor1 = init_zero_tensor;
+  init_zero_tensor1.name = "hidden_init";
+  input_slots->assign({week_tensor, init_zero_tensor, minute_tensor,
+                       init_zero_tensor1, lod_attention_tensor,
+                       lod_tensor_tensor});
+  for (auto &tensor : *input_slots) {
+    tensor.dtype = PaddleDType::FLOAT32;
+  }
+}
+
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<PaddleTensor> &base_outputs) {
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &base_out = base_outputs[i];
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
+                                   1, [](int a, int b) { return a * b; });
+    PADDLE_ENFORCE_EQ(size, size1);  // products of the shape dims must match
+    PADDLE_ENFORCE_GT(size, 0);
+    float *data = static_cast<float *>(out.data.data());  // read as float
+    float *base_data = static_cast<float *>(base_out.data.data());
+    for (size_t j = 0; j < size; j++) {  // j: element, i: tensor
+      EXPECT_NEAR(data[j], base_data[j], 1e-3);
+    }
+  }
+}
+// Test a complicated model; use_analysis only gates the final fuse checks.
+void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
+  AnalysisConfig config;
+  config.prog_file = FLAGS_infer_model + "/__model__";
+  config.param_file = FLAGS_infer_model + "/param";
+  config.use_gpu = false;
+  config.device = 0;
+  config.specify_input_name = true;
+  config.enable_ir_optim = activate_ir;
+  PADDLE_ENFORCE(config.ir_mode ==
+                 AnalysisConfig::IrPassMode::kExclude);  // default
+  config.ir_passes.clear();  // Do not exclude any pass.
+
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+
+  auto base_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+  std::vector<PaddleTensor> input_slots;
+  DataRecord data(FLAGS_infer_data, batch_size);
+  // Prepare inputs.
+  PrepareInputs(&input_slots, &data, batch_size);
+  std::vector<PaddleTensor> outputs, base_outputs;
+
+  base_predictor->Run(input_slots, &base_outputs);
+
+  if (num_threads == 1) {
+    // Time num_times runs of the analysis predictor on this thread.
+    Timer timer;
+    timer.tic();
+    for (int i = 0; i < num_times; i++) {
+      predictor->Run(input_slots, &outputs);
+    }
+    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
+    CompareResult(outputs, base_outputs);
+  } else {
+    std::vector<std::thread> threads;
+    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
+    // because AttentionLSTM's hard-coded node id will be damaged.
+    for (int tid = 0; tid < num_threads; ++tid) {
+      predictors.emplace_back(
+          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+              config));
+    }
+    for (int tid = 0; tid < num_threads; ++tid) {
+      threads.emplace_back([&, tid]() {
+        // Each thread should have local input_slots and outputs.
+        std::vector<PaddleTensor> input_slots;
+        DataRecord data(FLAGS_infer_data, batch_size);
+        PrepareInputs(&input_slots, &data, batch_size);
+        std::vector<PaddleTensor> outputs;
+        Timer timer;
+        timer.tic();
+        for (int i = 0; i < num_times; i++) {
+          predictors[tid]->Run(input_slots, &outputs);
+        }
+        PrintTime(batch_size, num_times, num_threads, tid,
+                  timer.toc() / num_times);
+        CompareResult(outputs, base_outputs);
+      });
+    }
+    for (int i = 0; i < num_threads; ++i) {
+      threads[i].join();
+    }
+  }
+
+  if (use_analysis && activate_ir) {
+    auto *analysis_pred = dynamic_cast<AnalysisPredictor *>(predictor.get());
+    ASSERT_TRUE(analysis_pred != nullptr);  // stop before a null dereference
+    auto &fuse_statis = analysis_pred->analysis_argument()
+                            .Get<std::unordered_map<std::string, int>>(
+                                framework::ir::kFuseStatisAttr);
+    for (auto &item : fuse_statis) {
+      LOG(INFO) << "fused " << item.first << " " << item.second;
+    }
+
+    int num_ops = 0;
+    for (auto &node :
+         analysis_pred->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
+      }
+    }
+    LOG(INFO) << "has num ops: " << num_ops;
+
+    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
+    EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
+    // After graph optimization, only 13 operators exist.
+    EXPECT_EQ(num_ops, 13);
+  }
+}
+
+// Analysis + IR only (thread count from --num_threads), handy for profiling.
+TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); }
+
+// Other unit-tests of RNN1, test different options of use_analysis,
+// activate_ir and multi-threads.
+TEST(Analyzer, RNN_tests) {
+  int num_threads[2] = {1, 4};
+  for (auto i : num_threads) {
+    // Directly infer with the original model.
+    TestRNN1Prediction(false, false, i);
+    // Inference with the original model with the analysis turned on;
+    // the analysis module will transform the program to a data flow
+    // graph.
+    TestRNN1Prediction(true, false, i);
+    // Inference with analysis and IR. The IR module will fuse some large
+    // kernels.
+    TestRNN1Prediction(true, true, i);
+  }
+}
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index cc4b390495..3b5be7f3ee 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -16,21 +16,9 @@
 
 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 10, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 
 namespace paddle {
 namespace inference {
@@ -91,274 +79,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
   }
 }
 
-namespace {
-
-struct DataRecord {
-  std::vector<std::vector<std::vector<float>>> link_step_data_all;
-  std::vector<std::vector<float>> week_data_all, minute_data_all;
-  std::vector<size_t> lod1, lod2, lod3;
-  std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
-      rnn_minute_datas;
-  size_t batch_iter{0};
-  size_t batch_size{1};
-  DataRecord() = default;
-  explicit DataRecord(const std::string &path, int batch_size = 1)
-      : batch_size(batch_size) {
-    Load(path);
-  }
-  DataRecord NextBatch() {
-    DataRecord data;
-    size_t batch_end = batch_iter + batch_size;
-    // NOTE skip the final batch, if no enough data is provided.
-    if (batch_end <= link_step_data_all.size()) {
-      data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
-                                     link_step_data_all.begin() + batch_end);
-      data.week_data_all.assign(week_data_all.begin() + batch_iter,
-                                week_data_all.begin() + batch_end);
-      data.minute_data_all.assign(minute_data_all.begin() + batch_iter,
-                                  minute_data_all.begin() + batch_end);
-      // Prepare LoDs
-      data.lod1.push_back(0);
-      data.lod2.push_back(0);
-      data.lod3.push_back(0);
-      CHECK(!data.link_step_data_all.empty()) << "empty";
-      CHECK(!data.week_data_all.empty());
-      CHECK(!data.minute_data_all.empty());
-      CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size());
-      CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size());
-      for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
-        for (const auto &d : data.link_step_data_all[j]) {
-          data.rnn_link_data.push_back(d);
-        }
-        data.rnn_week_datas.push_back(data.week_data_all[j]);
-        data.rnn_minute_datas.push_back(data.minute_data_all[j]);
-        // calculate lod
-        data.lod1.push_back(data.lod1.back() +
-                            data.link_step_data_all[j].size());
-        data.lod3.push_back(data.lod3.back() + 1);
-        for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) {
-          data.lod2.push_back(data.lod2.back() +
-                              data.link_step_data_all[j].size());
-        }
-      }
-    }
-    batch_iter += batch_size;
-    return data;
-  }
-  void Load(const std::string &path) {
-    std::ifstream file(path);
-    std::string line;
-    int num_lines = 0;
-    while (std::getline(file, line)) {
-      num_lines++;
-      std::vector<std::string> data;
-      split(line, ':', &data);
-      std::vector<std::vector<float>> link_step_data;
-      std::vector<std::string> link_datas;
-      split(data[0], '|', &link_datas);
-      for (auto &step_data : link_datas) {
-        std::vector<float> tmp;
-        split_to_float(step_data, ',', &tmp);
-        link_step_data.push_back(tmp);
-      }
-      // load week data
-      std::vector<float> week_data;
-      split_to_float(data[2], ',', &week_data);
-      // load minute data
-      std::vector<float> minute_data;
-      split_to_float(data[1], ',', &minute_data);
-      link_step_data_all.push_back(std::move(link_step_data));
-      week_data_all.push_back(std::move(week_data));
-      minute_data_all.push_back(std::move(minute_data));
-    }
-  }
-};
-void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
-                   int batch_size) {
-  PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
-      week_tensor, minute_tensor;
-  lod_attention_tensor.name = "data_lod_attention";
-  init_zero_tensor.name = "cell_init";
-  lod_tensor_tensor.name = "data";
-  week_tensor.name = "week";
-  minute_tensor.name = "minute";
-  auto one_batch = data->NextBatch();
-  std::vector<int> rnn_link_data_shape(
-      {static_cast<int>(one_batch.rnn_link_data.size()),
-       static_cast<int>(one_batch.rnn_link_data.front().size())});
-  lod_attention_tensor.shape.assign({1, 2});
-  lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
-  init_zero_tensor.shape.assign({batch_size, 15});
-  init_zero_tensor.lod.assign({one_batch.lod3});
-  lod_tensor_tensor.shape = rnn_link_data_shape;
-  lod_tensor_tensor.lod.assign({one_batch.lod1});
-  // clang-format off
-  week_tensor.shape.assign(
-      {static_cast<int>(one_batch.rnn_week_datas.size()),
-       static_cast<int>(one_batch.rnn_week_datas.front().size())});
-  week_tensor.lod.assign({one_batch.lod3});
-  minute_tensor.shape.assign(
-      {static_cast<int>(one_batch.rnn_minute_datas.size()),
-       static_cast<int>(one_batch.rnn_minute_datas.front().size())});
-  minute_tensor.lod.assign({one_batch.lod3});
-  // clang-format on
-  // assign data
-  TensorAssignData<float>(&lod_attention_tensor,
-                          std::vector<std::vector<float>>({{0, 0}}));
-  std::vector<float> tmp_zeros(batch_size * 15, 0.);
-  TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
-  TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
-  TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
-  TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
-  // Set inputs.
-  auto init_zero_tensor1 = init_zero_tensor;
-  init_zero_tensor1.name = "hidden_init";
-  input_slots->assign({week_tensor, init_zero_tensor, minute_tensor,
-                       init_zero_tensor1, lod_attention_tensor,
-                       lod_tensor_tensor});
-  for (auto &tensor : *input_slots) {
-    tensor.dtype = PaddleDType::FLOAT32;
-  }
-}
-
-}  // namespace
-
-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<PaddleTensor> &base_outputs) {
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
-    }
-  }
-}
-// Test with a really complicate model.
-void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
-  AnalysisConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  config.enable_ir_optim = activate_ir;
-  PADDLE_ENFORCE(config.ir_mode ==
-                 AnalysisConfig::IrPassMode::kExclude);  // default
-  config.ir_passes.clear();  // Do not exclude any pass.
-
-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
-
-  auto base_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
-  std::vector<PaddleTensor> input_slots;
-  DataRecord data(FLAGS_infer_data, batch_size);
-  // Prepare inputs.
-  PrepareInputs(&input_slots, &data, batch_size);
-  std::vector<PaddleTensor> outputs, base_outputs;
-
-  base_predictor->Run(input_slots, &base_outputs);
-
-  if (num_threads == 1) {
-    // Prepare inputs.
-    Timer timer;
-    timer.tic();
-    for (int i = 0; i < num_times; i++) {
-      predictor->Run(input_slots, &outputs);
-    }
-    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
-    CompareResult(outputs, base_outputs);
-  } else {
-    std::vector<std::thread> threads;
-    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
-    // because AttentionLSTM's hard code nodeid will be damanged.
-    for (int tid = 0; tid < num_threads; ++tid) {
-      predictors.emplace_back(
-          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-              config));
-    }
-    for (int tid = 0; tid < num_threads; ++tid) {
-      threads.emplace_back([&, tid]() {
-        // Each thread should have local input_slots and outputs.
-        std::vector<PaddleTensor> input_slots;
-        DataRecord data(FLAGS_infer_data, batch_size);
-        PrepareInputs(&input_slots, &data, batch_size);
-        std::vector<PaddleTensor> outputs;
-        Timer timer;
-        timer.tic();
-        for (int i = 0; i < num_times; i++) {
-          predictors[tid]->Run(input_slots, &outputs);
-        }
-        PrintTime(batch_size, num_times, num_threads, tid,
-                  timer.toc() / num_times);
-        CompareResult(outputs, base_outputs);
-      });
-    }
-    for (int i = 0; i < num_threads; ++i) {
-      threads[i].join();
-    }
-  }
-
-  if (use_analysis && activate_ir) {
-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
-
-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
-    }
-    LOG(INFO) << "has num ops: " << num_ops;
-
-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
-    EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
-    EXPECT_EQ(num_ops,
-              13);  // After graph optimization, only 13 operators exists.
-  }
-}
-
-// Inference with analysis and IR, easy for profiling independently.
-TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); }
-
-// Other unit-tests of RNN1, test different options of use_analysis,
-// activate_ir and multi-threads.
-TEST(Analyzer, RNN_tests) {
-  int num_threads[2] = {1, 4};
-  for (auto i : num_threads) {
-    // Directly infer with the original model.
-    TestRNN1Prediction(false, false, i);
-    // Inference with the original model with the analysis turned on, the
-    // analysis
-    // module will transform the program to a data flow graph.
-    TestRNN1Prediction(true, false, i);
-    // Inference with analysis and IR. The IR module will fuse some large
-    // kernels.
-    TestRNN1Prediction(true, true, i);
-  }
+TEST(Analyzer, word2vec_without_analysis) {
+  TestWord2vecPrediction(FLAGS_inference_model_dir);
 }
 
 }  // namespace analysis

From d0fbe780403b42490e90ed38a86c504bbf8f80c5 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Mon, 10 Sep 2018 19:45:05 +0800
Subject: [PATCH 21/85] move analyzer_xxx_tester to inference/tests/api

---
 paddle/fluid/inference/CMakeLists.txt         |  3 +-
 .../fluid/inference/analysis/CMakeLists.txt   | 64 -------------------
 .../fluid/inference/tests/api/CMakeLists.txt  | 60 +++++++++++++++++
 .../api}/analyzer_lac_tester.cc               |  0
 .../api}/analyzer_ner_tester.cc               |  0
 .../api}/analyzer_rnn1_tester.cc              |  0
 .../analyzer_text_classification_tester.cc    |  0
 7 files changed, 62 insertions(+), 65 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/CMakeLists.txt
 rename paddle/fluid/inference/{analysis => tests/api}/analyzer_lac_tester.cc (100%)
 rename paddle/fluid/inference/{analysis => tests/api}/analyzer_ner_tester.cc (100%)
 rename paddle/fluid/inference/{analysis => tests/api}/analyzer_rnn1_tester.cc (100%)
 rename paddle/fluid/inference/{analysis => tests/api}/analyzer_text_classification_tester.cc (100%)

diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 2006e3b24f..efb91bcf75 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -55,6 +55,7 @@ if(NOT APPLE)
 endif()
 
 if(WITH_TESTING)
-  # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
+  # tests/book depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
+  add_subdirectory(tests/api)
 endif()
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 699e16ad97..a36f85bd70 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -44,27 +44,6 @@ inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
     EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
     ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
 
-function (inference_download_and_uncompress install_dir url gz_filename)
-    message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
-    execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
-    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
-    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}")
-    message(STATUS "finish downloading ${gz_filename}")
-endfunction(inference_download_and_uncompress)
-
-set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz")
-set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz")
-set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1" CACHE PATH "RNN1 model and data root." FORCE)
-if (NOT EXISTS ${RNN1_INSTALL_DIR} AND WITH_TESTING)
-  inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} "rnn1%2Fmodel.tar.gz")
-  inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz")
-endif()
-
-inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
-    ARGS --infer_model=${RNN1_INSTALL_DIR}/model
-         --infer_data=${RNN1_INSTALL_DIR}/data.txt)
-
 inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
 inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
 inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
@@ -75,46 +54,3 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
 inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
 inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
-
-set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
-set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
-set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
-if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
-  inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
-  inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
-endif()
-
-inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
-    ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
-        --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
-
-set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
-set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
-set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
-if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
-    inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
-    inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
-endif()
-
-inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
-    ARGS --infer_model=${LAC_INSTALL_DIR}/model
-        --infer_data=${LAC_INSTALL_DIR}/data.txt)
-
-
-set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
-set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
-set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
-
-if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
-  inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
-  inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
-endif()
-
-inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
-    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
-         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
-         --topn=1 # Just run top 1 batch.
-    )
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
new file mode 100644
index 0000000000..caf23eef40
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -0,0 +1,60 @@
+function (inference_download_and_uncompress install_dir url gz_filename)
+    message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
+    execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
+    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
+    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}")
+    message(STATUS "finish downloading ${gz_filename}")
+endfunction(inference_download_and_uncompress)
+
+function(download_model_and_data install_dir model_url model_gz_filename data_url data_gz_filename)
+    if (NOT EXISTS ${install_dir} AND WITH_INFERENCE)
+        inference_download_and_uncompress(${install_dir} ${model_url} ${model_gz_filename})
+        inference_download_and_uncompress(${install_dir} ${data_url} ${data_gz_filename})
+    endif()
+endfunction()
+
+# RNN1
+set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz")
+set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz")
+set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1")
+download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} "rnn1%2Fmodel.tar.gz"
+                        ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz")
+inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+    ARGS --infer_model=${RNN1_INSTALL_DIR}/model
+         --infer_data=${RNN1_INSTALL_DIR}/data.txt)
+
+# chinese_ner
+set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
+set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
+set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner")
+download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz"
+                        ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
+inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
+    ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
+        --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
+
+# lac
+set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
+set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
+set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac")
+download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz"
+                        ${LAC_DATA_URL} "lac_data.txt.tar.gz")
+inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+    ARGS --infer_model=${LAC_INSTALL_DIR}/model
+        --infer_data=${LAC_INSTALL_DIR}/data.txt)
+
+# text_classification
+set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
+set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
+set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification")
+download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text_classification-Senta.tar.gz"
+                        ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
+inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
+    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
+         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
+         --topn=1 # Just run top 1 batch.
+         )
diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
similarity index 100%
rename from paddle/fluid/inference/analysis/analyzer_lac_tester.cc
rename to paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
similarity index 100%
rename from paddle/fluid/inference/analysis/analyzer_ner_tester.cc
rename to paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
diff --git a/paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
similarity index 100%
rename from paddle/fluid/inference/analysis/analyzer_rnn1_tester.cc
rename to paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
diff --git a/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
similarity index 100%
rename from paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc
rename to paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc

From 81c21705b493bd59259f491e3818af8ba09033ab Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Mon, 10 Sep 2018 20:24:04 +0800
Subject: [PATCH 22/85] simplify inference/tests/api/CMakeLists.txt

---
 .../fluid/inference/tests/api/CMakeLists.txt  | 27 +++++++++----------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index caf23eef40..d44a2cfa7f 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -1,15 +1,16 @@
-function (inference_download_and_uncompress install_dir url gz_filename)
-    message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
+function (inference_download_and_uncompress install_dir url)
+    get_filename_component(filename ${url} NAME)
+    message(STATUS "Download inference test stuff ${filename} from ${url}")
     execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
     execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
-    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}")
-    message(STATUS "finish downloading ${gz_filename}")
+    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
+    message(STATUS "finish downloading ${filename}")
 endfunction(inference_download_and_uncompress)
 
-function(download_model_and_data install_dir model_url model_gz_filename data_url data_gz_filename)
+function(download_model_and_data install_dir model_url data_url)
     if (NOT EXISTS ${install_dir} AND WITH_INFERENCE)
-        inference_download_and_uncompress(${install_dir} ${model_url} ${model_gz_filename})
-        inference_download_and_uncompress(${install_dir} ${data_url} ${data_gz_filename})
+        inference_download_and_uncompress(${install_dir} ${model_url})
+        inference_download_and_uncompress(${install_dir} ${data_url})
     endif()
 endfunction()
 
@@ -17,8 +18,7 @@ endfunction()
 set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz")
 set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz")
 set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1")
-download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} "rnn1%2Fmodel.tar.gz"
-                        ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz")
+download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} ${RNN1_DATA_URL})
 inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
     EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
     ARGS --infer_model=${RNN1_INSTALL_DIR}/model
@@ -28,8 +28,7 @@ inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
 set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
 set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
 set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner")
-download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz"
-                        ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
+download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} ${CHINESE_NER_DATA_URL})
 inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
     EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
     ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
@@ -39,8 +38,7 @@ inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
 set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
 set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
 set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac")
-download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz"
-                        ${LAC_DATA_URL} "lac_data.txt.tar.gz")
+download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} ${LAC_DATA_URL})
 inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
     EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
     ARGS --infer_model=${LAC_INSTALL_DIR}/model
@@ -50,8 +48,7 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
 set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
 set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification")
-download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text_classification-Senta.tar.gz"
-                        ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
+download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL})
 inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
     EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
     ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta

From 1658958fe697f1b7a2c558e8bda06285826b058a Mon Sep 17 00:00:00 2001
From: Krzysztof Binias <krzysztof.binias@intel.com>
Date: Mon, 10 Sep 2018 14:57:10 +0200
Subject: [PATCH 23/85] Reusing converted weights

---
 paddle/fluid/operators/conv_mkldnn_op.cc | 9 ++++++---
 paddle/fluid/operators/conv_op.cc        | 1 +
 paddle/fluid/platform/mkldnn_helper.h    | 6 +++---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index c5cbadc892..1ccf2494f2 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -130,12 +130,13 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+      const std::vector<mkldnn::primitive>& pipeline,
+      bool is_test = false) {  // NOLINT
     auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
     auto weights_pd = conv_pd_->weights_primitive_desc();
     return this->AcquireMemory(weights_pd, user_weights_pd,
                                user_weights_memory_p, "@weights_mem_p",
-                               pipeline);
+                               pipeline, is_test);
   }
 
   std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
@@ -266,6 +267,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
 
+    const bool is_test = ctx.Attr<bool>("is_test");
+
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -371,7 +374,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto src_memory_p =
         handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
     auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
-        user_weights_memory_p, pipeline);
+        user_weights_memory_p, pipeline, is_test);
     auto dst_memory_p =
         handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
 
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 61ca80877a..6070173ee2 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -109,6 +109,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
 }
 
 void Conv2DOpMaker::Make() {
+  AddAttr<bool>("is_test", "").SetDefault(false);
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution operator. "
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index f6e9a52b27..c64e5dafda 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -191,8 +191,8 @@ class MKLDNNHandler {
       mkldnn::memory::primitive_desc& mpd,       // NOLINT
       mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      const std::string& suffix,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+      const std::string& suffix, const std::vector<mkldnn::primitive>& pipeline,
+      bool is_test = false) {  // NOLINT
     // create reorder primitive if the input format is not the preferred one
     auto local_key = key_ + suffix;
     auto key_reorder_p = key_ + suffix + "reorder_p";
@@ -213,7 +213,7 @@ class MKLDNNHandler {
         pipeline.push_back(*reorder_p);
       }
       dev_ctx_.SetBlob(local_key, target_memory_p);
-    } else {
+    } else if (!is_test) {
       // Make reorder if needed
       auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
           dev_ctx_.GetBlob(key_reorder_p));

From 9664c53c7cae450dc70459008f1509bffb2d0518 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Mon, 10 Sep 2018 22:50:24 +0800
Subject: [PATCH 24/85] fix cmake error to pass the ci

---
 paddle/fluid/inference/analysis/CMakeLists.txt | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index a36f85bd70..29e3d7dacd 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -35,15 +35,12 @@ function (inference_analysis_test TARGET)
         cc_test(${TARGET}
                 SRCS "${analysis_test_SRCS}"
                 DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
-                ARGS ${mem_opt} ${analysis_test_ARGS})
+                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
         set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
     endif(WITH_TESTING)
 endfunction(inference_analysis_test)
 
-inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
-    ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
-
+inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
 inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
 inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
 inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)

From c4d6364060f87d094524115d358c31c23008e2e0 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 11 Sep 2018 13:02:44 +0800
Subject: [PATCH 25/85] update release doc

---
 doc/fluid/dev/releasing_process_cn.md | 44 ++++++++---------
 doc/fluid/dev/releasing_process_en.md | 68 ++++++++++++++++++---------
 2 files changed, 66 insertions(+), 46 deletions(-)

diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md
index 4c6728fba7..b3ce2b1b02 100644
--- a/doc/fluid/dev/releasing_process_cn.md
+++ b/doc/fluid/dev/releasing_process_cn.md
@@ -1,24 +1,23 @@
 # PaddlePaddle发行规范
 
-PaddlePaddle使用git-flow branching model做分支管理，使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。
+PaddlePaddle使用Trunk Based Development，使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。
 
 PaddlePaddle每次发新的版本，遵循以下流程:
 
 1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
-1. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
-1. 对这个版本的提交，做如下几个操作:
-  * 使用Regression Test List作为检查列表，测试本次release的正确性。
-	  * 如果失败，记录下所有失败的例子，在这个`release/版本号`分支中，修复所有bug后，Patch号加一，到第二步
-	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
-	* 将这个版本的python wheel包发布到pypi。
-	* 更新Docker镜像（参考后面的操作细节）。
-1. 第三步完成后，将`release/版本号`分支合入master分支，将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。
-1. 协同完成Release Note的书写。
+2. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0-rc0`。
+3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。
+4. QA和研发发现的bug，在develop上修复验证后，cherry-pick到release分支。直到release分支相对稳定。
+5. 如果有需要，在release分支最新代码上打上新的tag，比如`0.10.0-rc1`，让更多的用户加入测试。重复3-4步。
+6. release分支稳定后，打上正式的release tag，比如`0.10.0`。
+7. 将这个版本的python wheel包发布到pypi。
+8. 更新Docker镜像（参考后面的操作细节）。
 
 需要注意的是:
 
-* `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试PaddlePaddle的行为。
-* 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
+* bug修复需要先在develop上进行，然后进入release分支。而不是直接在release分支上开发。
+
+* release分支原则上只接受修复类的修改，不接受新feature。
 
 ## 发布wheel包到pypi
 
@@ -61,24 +60,21 @@ docker push [镜像]:[version]
 
 ## PaddlePaddle 分支规范
 
-PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
+PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。
 
-* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
-	* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
-	* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试，但并没有经过回归测试。
-	* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
+* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。
+* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试，bug修复和最终发版。
+* `master`分支因为历史原因，已经废弃。
 
-* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，但所有fork的版本库的所有分支都相当于特性分支。
+* 其他开发者fork的feature branch。
 	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
-	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的功能分支。
-	* 当功能分支开发完毕后，向PaddlePaddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
-		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。
-
-* BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
+	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的feature branch。
+	* 当feature branch开发完毕后，向PaddlePaddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
+		* 在评审过程中，开发者修改自己的代码，可以继续在自己的feature branch提交代码。
 
 ## PaddlePaddle回归测试列表
 
-本列表说明PaddlePaddle发版之前需要测试的功能点。
+TODO
 
 ### PaddlePaddle Book中所有章节
 
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
index 2c1c30c1ed..e3ca3acd22 100644
--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -17,13 +17,23 @@ Each time we release a new PaddlePaddle version, we should follow the below step
     * Update the Docker images (see below instructions for detail).
 1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
    then merge `master` to `develop`.
-1. Update the Release Note.          
+1. Update the Release Note.
 
-***NOTE:***
+1. Create a new release branch from `develop`，named `release/[version]`. E.g.，`release/0.10.0`
+2. Create a new tag for the release branch, tag format: `version-rc.Patch`. The first tag is `0.10.0-rc0`。
+3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch.
+4. If QA or Developer find bugs. They should first fix and verity on `develop` branch. Then cherry-pick to the release branch. Wait until the release branch is stable.
+5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4.
+6. After the release branch is stable, create the official release tag, such as `0.10.0`.
+7. Release the python wheel package to pypi.
+8. Update the docker image (More details below).
+
+NOTE:
+
+* Bug fixes should happen on the `develop` branch first, then be cherry-picked to the release branch. Avoid developing directly on the release branch.
+
+* Release branches normally only accept bug fixes. Don't add new features.
 
-* Do ***NOT*** merge commits from develop branch to release branches to keep the release branch contain
-  features only for current release, so that we can test on that version.
-* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch.
 
 ## Publish Wheel Packages to pypi
 
@@ -95,28 +105,42 @@ Tags that need to be updated are:
 
 You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
 
+## PaddlePaddle 分支规范
+
+PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。
+
+* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。
+* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试，bug修复和最终发版。
+* `master`分支因为历史原因，已经废弃。
+
+* 其他开发者fork的feature branch。
+	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
+	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的feature branch。
+	* 当feature branch开发完毕后，向PaddlePaddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
+		* 在评审过程中，开发者修改自己的代码，可以继续在自己的feature branch提交代码。
+
+## PaddlePaddle回归测试列表
+
+TODO
+
 ## Branching Model
 
-We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
-with some modifications:
-
-* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
-* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
-  regression tests are run.
-* `release/[version]` branch is used to publish each release. Latest release version branches have
-  bugfix only for that version, but no feature updates.
-* Developer forks are not required to follow
-  [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
-  branching model, all forks is like a feature branch.
-    * Advise: developer fork's develop branch is used to sync up with main repo's develop branch.
-    * Advise: developer use it's fork's develop branch to for new branch to start developing.
-  * Use that branch on developer's fork to create pull requests and start reviews.
-      * developer can push new commits to that branch when the pull request is open.
-* Bug fixes are also started from developers forked repo. And, bug fixes branch can merge to
-  `master`, `develop` and `releases`.
+PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model.
+
+* `develop` branch is used for development. Each commit to the `develop` branch goes through unit tests and model regression tests.
+* `release/[version]` branch is used for each release. The release branch is used for tests, bug fixes and the eventual release.
+* `master` branch has been deprecated for historical reasons.
+
+* Developer's feature branch.
+	* Developer's feature branch should sync with upstream `develop` branch.
+	* Developer's feature branch should be forked from upstream `develop` branch.
+	* After feature branch is ready, create a `Pull Request` against the Paddle repo and go through code review.
+	   * During the review process, developers modify their code and push to their own feature branch.
 
 ## PaddlePaddle Regression Test List
 
+TODO
+
 ### All Chapters of PaddlePaddle Book
 
 We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including

From 5b12eb9294c4e0a109d4cd6224eff4c18948466f Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 11 Sep 2018 13:13:55 +0800
Subject: [PATCH 26/85] clean

---
 doc/fluid/dev/releasing_process_cn.md |  6 ++---
 doc/fluid/dev/releasing_process_en.md | 35 +--------------------------
 2 files changed, 4 insertions(+), 37 deletions(-)

diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md
index b3ce2b1b02..b1b6595eff 100644
--- a/doc/fluid/dev/releasing_process_cn.md
+++ b/doc/fluid/dev/releasing_process_cn.md
@@ -5,7 +5,7 @@ PaddlePaddle使用Trunk Based Development，使用[Semantic Versioning](http://s
 PaddlePaddle每次发新的版本，遵循以下流程:
 
 1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
-2. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0-rc0`。
+2. 将新分支的版本打上tag，tag为`版本号rc-Patch号`。例如，第一个tag为`0.10.0-rc0`。
 3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。
 4. QA和研发发现的bug，在develop上修复验证后，cherry-pick到release分支。直到release分支相对稳定。
 5. 如果有需要，在release分支最新代码上打上新的tag，比如`0.10.0-rc1`，让更多的用户加入测试。重复3-4步。
@@ -67,8 +67,8 @@ PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelo
 * `master`分支因为历史原因，已经废弃。
 
 * 其他开发者fork的feature branch。
-	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
-	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的feature branch。
+	* 建议，开发者的feature branch需要同步主版本库的`develop`分支。
+	* 建议，开发者的feature branch需要基于朱版本库中的`develop`分支。
 	* 当feature branch开发完毕后，向PaddlePaddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
 		* 在评审过程中，开发者修改自己的代码，可以继续在自己的feature branch提交代码。
 
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
index e3ca3acd22..a4ea4f6fb0 100644
--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -4,23 +4,8 @@ PaddlePaddle manages its branches using "git-flow branching model", and [Semanti
 
 Each time we release a new PaddlePaddle version, we should follow the below steps:
 
-1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
-1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The
-   first tag should be `0.10.0rc1`, and the second should be `0.10.0.rc2` and so on.
-1. After that, we should do:
-  * Run all regression test on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm
-      that this release has no major bugs.
-        * If regression test fails, we must fix those bugs and create a new `release/[version]`
-          branch from previous release branch.
-    * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
-    * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
-    * Update the Docker images (see below instructions for detail).
-1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
-   then merge `master` to `develop`.
-1. Update the Release Note.
-
 1. Create a new release branch from `develop`，named `release/[version]`. E.g.，`release/0.10.0`
-2. Create a new tag for the release branch, tag format: `version-rc.Patch`. The first tag is `0.10.0-rc0`。
+2. Create a new tag for the release branch, tag format: `version-rc.Patch`. E.g. the first tag is `0.10.0-rc0`。
 3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch.
 4. If QA or Developer find bugs. They should first fix and verity on `develop` branch. Then cherry-pick to the release branch. Wait until the release branch is stable.
 5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4.
@@ -105,24 +90,6 @@ Tags that need to be updated are:
 
 You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
 
-## PaddlePaddle 分支规范
-
-PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。
-
-* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。
-* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试，bug修复和最终发版。
-* `master`分支因为历史原因，已经废弃。
-
-* 其他开发者fork的feature branch。
-	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
-	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的feature branch。
-	* 当feature branch开发完毕后，向PaddlePaddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
-		* 在评审过程中，开发者修改自己的代码，可以继续在自己的feature branch提交代码。
-
-## PaddlePaddle回归测试列表
-
-TODO
-
 ## Branching Model
 
 PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model.

From 52122704d351c1339f8728d5dfc91a82f4b2e60d Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 11 Sep 2018 13:19:09 +0800
Subject: [PATCH 27/85] clean

---
 doc/fluid/dev/releasing_process_cn.md | 4 ++--
 doc/fluid/dev/releasing_process_en.md | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md
index b1b6595eff..acea9a2b5d 100644
--- a/doc/fluid/dev/releasing_process_cn.md
+++ b/doc/fluid/dev/releasing_process_cn.md
@@ -7,7 +7,7 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
 2. 将新分支的版本打上tag，tag为`版本号rc-Patch号`。例如，第一个tag为`0.10.0-rc0`。
 3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。
-4. QA和研发发现的bug，在develop上修复验证后，cherry-pick到release分支。直到release分支相对稳定。
+4. QA和研发发现的bug，在develop上修复验证后，cherry-pick修复到release分支。直到release分支相对稳定。
 5. 如果有需要，在release分支最新代码上打上新的tag，比如`0.10.0-rc1`，让更多的用户加入测试。重复3-4步。
 6. release分支稳定后，打上正式的release tag，比如`0.10.0`。
 7. 将这个版本的python wheel包发布到pypi。
@@ -68,7 +68,7 @@ PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelo
 
 * 其他开发者fork的feature branch。
 	* 建议，开发者的feature branch需要同步主版本库的`develop`分支。
-	* 建议，开发者的feature branch需要基于朱版本库中的`develop`分支。
+	* 建议，开发者的feature branch需要基于主版本库中的`develop`分支。
 	* 当feature branch开发完毕后，向PaddlePaddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
 		* 在评审过程中，开发者修改自己的代码，可以继续在自己的feature branch提交代码。
 
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
index a4ea4f6fb0..b810dc941d 100644
--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -7,7 +7,7 @@ Each time we release a new PaddlePaddle version, we should follow the below step
 1. Create a new release branch from `develop`，named `release/[version]`. E.g.，`release/0.10.0`
 2. Create a new tag for the release branch, tag format: `version-rc.Patch`. E.g. the first tag is `0.10.0-rc0`。
 3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch.
-4. If QA or Developer find bugs. They should first fix and verity on `develop` branch. Then cherry-pick to the release branch. Wait until the release branch is stable.
+4. If QA or developers find bugs, they should first fix and verify them on the `develop` branch, then cherry-pick the fix to the release branch. Wait until the release branch is stable.
 5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4.
 6. After release branch is stable，Create the official release tag，such as `0.10.0`.
 7. Release the python wheel package to pypi.

From 5fd5bf9c9644b93ded4737edea84f4ea754b60d4 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Tue, 11 Sep 2018 14:18:50 +0800
Subject: [PATCH 28/85] sync resnet model

---
 benchmark/fluid/models/resnet.py | 225 +++++++++++++++----------------
 1 file changed, 108 insertions(+), 117 deletions(-)

diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index ae1baa48e1..d71b855612 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -20,6 +20,7 @@ import functools
 import numpy as np
 import time
 import os
+import math
 
 import cProfile, pstats, StringIO
 
@@ -27,128 +28,120 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
-# from recordio_converter import imagenet_train, imagenet_test
 from imagenet_reader import train, val
 
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "batch_size": 256,
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class ResNet():
+    def __init__(self, layers=50, is_train=True):
+        self.params = train_parameters
+        self.layers = layers
+        self.is_train = is_train
+
+    def net(self, input, class_dim=1000):
+        layers = self.layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+
+        if layers == 50:
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+        num_filters = [64, 128, 256, 512]
+
+        conv = self.conv_bn_layer(
+            input=input, num_filters=64, filter_size=7, stride=2, act='relu')
+        conv = fluid.layers.pool2d(
+            input=conv,
+            pool_size=3,
+            pool_stride=2,
+            pool_padding=1,
+            pool_type='max')
+
+        for block in range(len(depth)):
+            for i in range(depth[block]):
+                conv = self.bottleneck_block(
+                    input=conv,
+                    num_filters=num_filters[block],
+                    stride=2 if i == 0 and block != 0 else 1)
+
+        pool = fluid.layers.pool2d(
+            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+        out = fluid.layers.fc(input=pool,
+                              size=class_dim,
+                              act='softmax',
+                              param_attr=fluid.param_attr.ParamAttr(
+                                  initializer=fluid.initializer.Uniform(-stdv,
+                                                                        stdv)))
+        return out
+
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(
+            input=conv, act=act, is_test=not self.is_train)
+
+    def shortcut(self, input, ch_out, stride):
+        ch_in = input.shape[1]
+        if ch_in != ch_out or stride != 1:
+            return self.conv_bn_layer(input, ch_out, 1, stride)
+        else:
+            return input
 
-def conv_bn_layer(input,
-                  ch_out,
-                  filter_size,
-                  stride,
-                  padding,
-                  act='relu',
-                  is_train=True):
-    conv1 = fluid.layers.conv2d(
-        input=input,
-        filter_size=filter_size,
-        num_filters=ch_out,
-        stride=stride,
-        padding=padding,
-        act=None,
-        bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
-
-
-def shortcut(input, ch_out, stride, is_train=True):
-    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
-    if ch_in != ch_out:
-        return conv_bn_layer(
-            input, ch_out, 1, stride, 0, None, is_train=is_train)
-    else:
-        return input
-
-
-def basicblock(input, ch_out, stride, is_train=True):
-    short = shortcut(input, ch_out, stride, is_train=is_train)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
-    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
-
-
-def bottleneck(input, ch_out, stride, is_train=True):
-    short = shortcut(input, ch_out * 4, stride, is_train=is_train)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
-    conv3 = conv_bn_layer(
-        conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
-    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
-
-
-def layer_warp(block_func, input, ch_out, count, stride):
-    res_out = block_func(input, ch_out, stride)
-    for i in range(1, count):
-        res_out = block_func(res_out, ch_out, 1)
-    return res_out
-
+    def bottleneck_block(self, input, num_filters, stride):
+        conv0 = self.conv_bn_layer(
+            input=input, num_filters=num_filters, filter_size=1, act='relu')
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act='relu')
+        conv2 = self.conv_bn_layer(
+            input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
 
-def resnet_imagenet(input,
-                    class_dim,
-                    depth=50,
-                    data_format='NCHW',
-                    is_train=True):
+        short = self.shortcut(input, num_filters * 4, stride)
 
-    cfg = {
-        18: ([2, 2, 2, 1], basicblock),
-        34: ([3, 4, 6, 3], basicblock),
-        50: ([3, 4, 6, 3], bottleneck),
-        101: ([3, 4, 23, 3], bottleneck),
-        152: ([3, 8, 36, 3], bottleneck)
-    }
-    stages, block_func = cfg[depth]
-    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
-    pool1 = fluid.layers.pool2d(
-        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
-    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
-    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
-    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
-    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
-    pool2 = fluid.layers.pool2d(
-        input=res4,
-        pool_size=7,
-        pool_type='avg',
-        pool_stride=1,
-        global_pooling=True)
-    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
-    return out
-
-
-def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
-    assert (depth - 2) % 6 == 0
-
-    n = (depth - 2) // 6
-
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
-    return out
+        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
 
 
 def _model_reader_dshape_classdim(args, is_train):
-    model = resnet_cifar10
+    model = None
     reader = None
-    if args.data_set == "cifar10":
-        class_dim = 10
-        if args.data_format == 'NCHW':
-            dshape = [3, 32, 32]
-        else:
-            dshape = [32, 32, 3]
-        model = resnet_cifar10
-        if is_train:
-            reader = paddle.dataset.cifar.train10()
-        else:
-            reader = paddle.dataset.cifar.test10()
-    elif args.data_set == "flowers":
+    if args.data_set == "flowers":
         class_dim = 102
         if args.data_format == 'NCHW':
             dshape = [3, 224, 224]
         else:
             dshape = [224, 224, 3]
-        model = resnet_imagenet
         if is_train:
             reader = paddle.dataset.flowers.train()
         else:
@@ -159,7 +152,6 @@ def _model_reader_dshape_classdim(args, is_train):
             dshape = [3, 224, 224]
         else:
             dshape = [224, 224, 3]
-        model = resnet_imagenet
         if not args.data_path:
             raise Exception(
                 "Must specify --data_path when training with imagenet")
@@ -173,12 +165,11 @@ def _model_reader_dshape_classdim(args, is_train):
                 reader = train(xmap=False)
             else:
                 reader = val(xmap=False)
-    return model, reader, dshape, class_dim
+    return reader, dshape, class_dim
 
 
 def get_model(args, is_train, main_prog, startup_prog):
-    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
-                                                                     is_train)
+    reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train)
 
     pyreader = None
     trainer_count = int(os.getenv("PADDLE_TRAINERS"))
@@ -198,7 +189,8 @@ def get_model(args, is_train, main_prog, startup_prog):
                 label = fluid.layers.data(
                     name='label', shape=[1], dtype='int64')
 
-            predict = model(input, class_dim, is_train=is_train)
+            model = ResNet(is_train=is_train)
+            predict = model.net(input, class_dim=class_dim)
             cost = fluid.layers.cross_entropy(input=predict, label=label)
             avg_cost = fluid.layers.mean(x=cost)
 
@@ -216,15 +208,14 @@ def get_model(args, is_train, main_prog, startup_prog):
                 total_images = 1281167 / trainer_count
 
                 step = int(total_images / args.batch_size + 1)
-                epochs = [30, 60, 80, 90]
+                epochs = [30, 60, 90]
                 bd = [step * e for e in epochs]
                 base_lr = args.learning_rate
                 lr = []
                 lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
                 optimizer = fluid.optimizer.Momentum(
-                    learning_rate=base_lr,
-                    #learning_rate=fluid.layers.piecewise_decay(
-                    #    boundaries=bd, values=lr),
+                    learning_rate=fluid.layers.piecewise_decay(
+                        boundaries=bd, values=lr),
                     momentum=0.9,
                     regularization=fluid.regularizer.L2Decay(1e-4))
                 optimizer.minimize(avg_cost)

From faf8ad2436522576cd1fdf0b783291e519308859 Mon Sep 17 00:00:00 2001
From: Bai Yifan <bai.yf@qq.com>
Date: Tue, 11 Sep 2018 15:33:36 +0800
Subject: [PATCH 29/85] Add ignore_index in cross_entropy op (#13217)

* add ignore index

* update api.spec

* enhance softmax_with_cross_entropy
---
 paddle/fluid/API.spec                         |  4 +--
 paddle/fluid/operators/cross_entropy_op.cc    |  5 +++
 paddle/fluid/operators/cross_entropy_op.h     | 26 +++++++++-----
 paddle/fluid/operators/math/cross_entropy.cc  |  9 +++--
 paddle/fluid/operators/math/cross_entropy.cu  | 15 +++++---
 paddle/fluid/operators/math/cross_entropy.h   |  3 +-
 .../softmax_with_cross_entropy_op.cc          |  6 ++++
 .../softmax_with_cross_entropy_op.cu          | 11 +++---
 .../operators/softmax_with_cross_entropy_op.h |  3 +-
 python/paddle/fluid/layers/nn.py              | 22 +++++++++---
 .../tests/unittests/test_cross_entropy_op.py  | 29 +++++++++++++++
 .../fluid/tests/unittests/test_layers.py      |  9 +++++
 .../test_softmax_with_cross_entropy_op.py     | 35 +++++++++++++++++++
 13 files changed, 148 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index ae5f30e431..842fde1ec5 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -100,7 +100,7 @@ paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_att
 paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
@@ -142,7 +142,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 578ab63bc3..66f19fe7ec 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -138,6 +138,11 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, default false), a flag indicating whether to "
                   "interpretate the given labels as soft labels.")
         .SetDefault(false);
+    AddAttr<int>("ignore_index",
+                 "(int, default -100), Specifies a target value that is "
+                 "ignored and does not contribute to the input gradient. "
+                 "Only valid if soft_label is set to False")
+        .SetDefault(-100);
     AddComment(R"DOC(
 CrossEntropy Operator.
 
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
index 36b58d8014..03974a7fc5 100644
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -40,7 +40,7 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
 
     math::CrossEntropyFunctor<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d,
-        ctx.Attr<bool>("soft_label"));
+        ctx.Attr<bool>("soft_label"), ctx.Attr<int>("ignore_index"));
   }
 };
 
@@ -74,16 +74,22 @@ class XeGradFunctor {
                 const T* dy,           // NOLINT
                 const T* x,            // NOLINT
                 const int64_t* label,  // NOLINT
-                size_t num_classes)
-      : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {}
+                size_t num_classes, size_t ignore_index)
+      : dx_(dx),
+        dy_(dy),
+        x_(x),
+        label_(label),
+        num_classes_(num_classes),
+        ignore_index_(ignore_index) {}
 
   HOSTDEVICE void operator()(size_t sample_id) {
     auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
     for (size_t x_offset = sample_id * num_classes_;
          x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
-      dx_[x_offset] = x_offset != x_is_true_offset
-                          ? static_cast<T>(0)
-                          : -dy_[sample_id] / x_[x_offset];
+      dx_[x_offset] =
+          (x_offset != x_is_true_offset || label_[sample_id] == ignore_index_)
+              ? static_cast<T>(0)
+              : -dy_[sample_id] / x_[x_offset];
     }
   }
 
@@ -93,6 +99,7 @@ class XeGradFunctor {
   const T* x_;
   const int64_t* label_;
   size_t num_classes_;
+  size_t ignore_index_;
 };
 
 template <typename DeviceContext, typename T>
@@ -109,6 +116,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
     // unnecessary to convert tensors to 2-D views.
     int rank = x->dims().size();
     int64_t class_num = x->dims()[rank - 1];
+    int64_t ignore_index = ctx.Attr<int>("ignore_index");
     if (ctx.Attr<bool>("soft_label")) {
       XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
                                         label->data<T>(),
@@ -118,9 +126,9 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
           static_cast<size_t>(dx->numel()));
       for_range(functor);
     } else {
-      XeGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
-                               label->data<int64_t>(),
-                               static_cast<size_t>(class_num));
+      XeGradFunctor<T> functor(
+          dx_data, dy->data<T>(), x->data<T>(), label->data<int64_t>(),
+          static_cast<size_t>(class_num), static_cast<size_t>(ignore_index));
       platform::ForRange<DeviceContext> for_range(
           ctx.template device_context<DeviceContext>(),
           static_cast<size_t>(dy->numel()));
diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc
index caff35e03a..18bf1a66f6 100644
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
@@ -28,7 +28,8 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out,
                   const framework::Tensor* prob,
-                  const framework::Tensor* labels, const bool softLabel) {
+                  const framework::Tensor* labels, const bool softLabel,
+                  const int ignore_index) {
     const int batch_size = prob->dims()[0];
     if (softLabel) {
       auto in = EigenMatrix<T>::From(*prob);
@@ -49,8 +50,12 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
         int lbl = label_data[i];
         PADDLE_ENFORCE_GE(lbl, 0);
         PADDLE_ENFORCE_LT(lbl, class_num);
+        PADDLE_ENFORCE((lbl >= 0 && lbl < class_num) || lbl == ignore_index);
         int index = i * class_num + lbl;
-        loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index]));
+        loss_data[i] =
+            lbl == ignore_index
+                ? 0
+                : -math::TolerableValue<T>()(std::log(prob_data[index]));
       }
     }
   }
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index 0de58d5fdd..c92341ea55 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -23,11 +23,14 @@ namespace math {
 namespace {
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
-                                   const int N, const int D) {
+                                   const int N, const int D,
+                                   const int ignore_index) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
-    PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
-    Y[i] = -math::TolerableValue<T>()(log(X[i * D + label[i]]));
+    PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index);
+    Y[i] = ignore_index == label[i]
+               ? 0
+               : -math::TolerableValue<T>()(log(X[i * D + label[i]]));
   }
 }
 
@@ -57,7 +60,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& ctx,
                   framework::Tensor* out, const framework::Tensor* prob,
-                  const framework::Tensor* labels, bool softLabel) {
+                  const framework::Tensor* labels, bool softLabel,
+                  const int ignore_index) {
     const T* prob_data = prob->data<T>();
     T* loss_data = out->mutable_data<T>(ctx.GetPlace());
 
@@ -77,7 +81,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
       int block = 512;
       int grid = (batch_size + block - 1) / block;
       CrossEntropyKernel<T><<<grid, block, 0, ctx.stream()>>>(
-          loss_data, prob_data, label_data, batch_size, class_num);
+          loss_data, prob_data, label_data, batch_size, class_num,
+          ignore_index);
     }
   }
 };
diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h
index adc5b3fe47..e8aeb5d057 100644
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@@ -38,7 +38,8 @@ class CrossEntropyFunctor {
  public:
   void operator()(const DeviceContext& context, framework::Tensor* out,
                   const framework::Tensor* prob,
-                  const framework::Tensor* labels, const bool softLabel);
+                  const framework::Tensor* labels, const bool softLabel,
+                  const int ignore_index);
 };
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 53cb716a97..1a9324ec86 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker
         "(bool, default: false), A flag to indicate whether to interpretate "
         "the given labels as soft labels.")
         .SetDefault(false);
+    AddAttr<int>(
+        "ignore_index",
+        "(int, default -100), Specifies a target value that is ignored and "
+        "does not contribute to the input gradient. Only valid if soft_label "
+        "is set to False")
+        .SetDefault(-100);
     AddComment(R"DOC(
 Softmax With Cross Entropy Operator.
 
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index a559b01ed3..148faec4af 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -26,7 +26,8 @@ using Tensor = framework::Tensor;
 namespace {
 template <typename T>
 __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
-                                 const int batch_size, const int class_num) {
+                                 const int batch_size, const int class_num,
+                                 const int ignore_index) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size;
        i += blockDim.x * gridDim.x) {
     int idx = i * class_num + labels[i];
@@ -260,6 +261,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
     auto* loss_data = loss->mutable_data<T>(context.GetPlace());
 
     auto soft_label = context.Attr<bool>("soft_label");
+    auto ignore_index = context.Attr<int>("ignore_index");
     if (soft_label) {
       int batch_size = logits->dims()[0];
       int feature_size = logits->dims()[1];
@@ -272,7 +274,8 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
       math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
                                      softmax);
       math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
-          context.cuda_device_context(), loss, softmax, labels, false);
+          context.cuda_device_context(), loss, softmax, labels, false,
+          ignore_index);
     }
   }
 };
@@ -295,7 +298,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
     const int class_num = logit_grad->dims()[1];
     int block = 512;
     auto stream = context.cuda_device_context().stream();
-
+    auto ignore_index = context.Attr<int>("ignore_index");
     if (context.Attr<bool>("soft_label")) {
       int grid = (batch_size * class_num + block - 1) / block;
       const T* label_data = labels->data<T>();
@@ -305,7 +308,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
       int grid = (batch_size + block - 1) / block;
       const int64_t* label_data = labels->data<int64_t>();
       CrossEntropyGrad<T><<<grid, block, 0, stream>>>(
-          logit_grad_data, label_data, batch_size, class_num);
+          logit_grad_data, label_data, batch_size, class_num, ignore_index);
       int num = batch_size * class_num;
       grid = (num + block - 1) / block;
       Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num,
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
index dd6f6aca5a..e9aba3b37b 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
@@ -45,7 +45,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
     math::SoftmaxFunctor<platform::CPUDeviceContext, T>()(dev_ctx, logits,
                                                           softmax);
     math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
-        dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"));
+        dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
+        context.Attr<int>("ignore_index"));
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 8408e6d2a1..3ae0fac4be 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -968,7 +968,7 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
     return out
 
 
-def cross_entropy(input, label, soft_label=False):
+def cross_entropy(input, label, soft_label=False, ignore_index=-100):
     """
     **Cross Entropy Layer**
 
@@ -1012,7 +1012,10 @@ def cross_entropy(input, label, soft_label=False):
                                tensor<float/double> with shape [N x D].
         soft_label (bool): a flag indicating whether to
                                            interpretate the given labels as soft
-                                           labels, default `False`.
+                                           labels. Default: `False`.
+        ignore_index (int): Specifies a target value that is ignored and does 
+                            not contribute to the input gradient. Only valid 
+                            if soft_label is set to False. Default: -100
 
     Returns:
          A 2-D tensor with shape [N x 1], the cross entropy loss.
@@ -1037,7 +1040,8 @@ def cross_entropy(input, label, soft_label=False):
         inputs={'X': [input],
                 'Label': [label]},
         outputs={'Y': [out]},
-        attrs={"soft_label": soft_label})
+        attrs={"soft_label": soft_label,
+               "ignore_index": ignore_index})
     return out
 
 
@@ -4242,7 +4246,10 @@ def multiplex(inputs, index):
     return out
 
 
-def softmax_with_cross_entropy(logits, label, soft_label=False):
+def softmax_with_cross_entropy(logits,
+                               label,
+                               soft_label=False,
+                               ignore_index=-100):
     """
     **Softmax With Cross Entropy Operator.**
 
@@ -4284,6 +4291,10 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
             soft_label is set to true, Label is a Tensor<float/double> with
         soft_label (bool): A flag to indicate whether to interpretate the given
             labels as soft labels. By default, `soft_label` is set to False.
+        ignore_index (int): Specifies a target value that is ignored and does
+                            not contribute to the input gradient. Only valid
+                            if soft_label is set to False. Default: -100.
+
     Returns:
         Variable: The cross entropy loss is a 2-D tensor with shape [N x 1].
 
@@ -4305,7 +4316,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
                 'Label': label},
         outputs={'Softmax': softmax,
                  'Loss': loss},
-        attrs={'soft_label': soft_label})
+        attrs={'soft_label': soft_label,
+               'ignore_index': ignore_index})
     return loss
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
index fa367f95fc..f22badbea0 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
@@ -209,5 +209,34 @@ class TestCrossEntropyOp6(OpTest):
             ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
 
 
+class TestCrossEntropyOp7(OpTest):
+    """Test cross-entropy with ignore index.
+    """
+
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        batch_size = 30
+        class_num = 10
+        ignore_index = 3
+
+        X = randomize_probability(batch_size, class_num, dtype='float64')
+
+        label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
+        cross_entropy = np.asmatrix(
+            [[-np.log(X[i][label[i][0]])]
+             if label[i][0] != ignore_index else [0]
+             for i in range(X.shape[0])],
+            dtype="float64")
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": False, "ignore_index": ignore_index}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index bc4d364c74..b04346b052 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -556,6 +556,15 @@ class TestBook(unittest.TestCase):
             out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0)
         print(str(program))
 
+    def test_cross_entropy(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[30, 10], dtype="float32")
+            label = layers.data(name="label", shape=[30, 1], dtype="int32")
+            mode = 'channel'
+            out = layers.cross_entropy(x, label, False, 4)
+            self.assertIsNotNone(out)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index b7e5ff6d52..a18941dd31 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -88,5 +88,40 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
         self.check_grad(["Logits"], "Loss")
 
 
+class TestSoftmaxWithCrossEntropyOp3(OpTest):
+    """
+    Test softmax with cross entropy operator with ignore_index.
+    """
+
+    def setUp(self):
+        self.op_type = "softmax_with_cross_entropy"
+        batch_size = 41
+        class_num = 37
+
+        logits = np.random.uniform(0.1, 1.0,
+                                   [batch_size, class_num]).astype("float64")
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
+        ignore_index = 7
+        cross_entropy = np.asmatrix(
+            [[-np.log(softmax[i][labels[i][0]])]
+             if labels[i] != ignore_index else [0]
+             for i in range(softmax.shape[0])],
+            dtype="float64")
+
+        self.inputs = {"Logits": logits, "Label": labels}
+        self.outputs = {
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
+        }
+        self.attrs = {"ignore_index": ignore_index}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss")
+
+
 if __name__ == "__main__":
     unittest.main()

From 1ce9e9dc3072f50e7e827fe6b63d59e3eb883196 Mon Sep 17 00:00:00 2001
From: Krzysztof Binias <krzysztof.binias@intel.com>
Date: Mon, 10 Sep 2018 15:29:51 +0200
Subject: [PATCH 30/85] Renaming decision variable

---
 paddle/fluid/operators/conv_mkldnn_op.cc | 4 ++--
 paddle/fluid/platform/mkldnn_helper.h    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 1ccf2494f2..244a578db8 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -131,12 +131,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
       const std::vector<mkldnn::primitive>& pipeline,
-      bool is_test = false) {  // NOLINT
+      bool is_persistent = false) {  // NOLINT
     auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
     auto weights_pd = conv_pd_->weights_primitive_desc();
     return this->AcquireMemory(weights_pd, user_weights_pd,
                                user_weights_memory_p, "@weights_mem_p",
-                               pipeline, is_test);
+                               pipeline, is_persistent);
   }
 
   std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index c64e5dafda..cf08202ccb 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -192,7 +192,7 @@ class MKLDNNHandler {
       mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
       const std::shared_ptr<mkldnn::memory> user_memory_p,
       const std::string& suffix, const std::vector<mkldnn::primitive>& pipeline,
-      bool is_test = false) {  // NOLINT
+      bool is_persistent = false) {  // NOLINT
     // create reorder primitive if the input format is not the preferred one
     auto local_key = key_ + suffix;
     auto key_reorder_p = key_ + suffix + "reorder_p";
@@ -213,7 +213,7 @@ class MKLDNNHandler {
         pipeline.push_back(*reorder_p);
       }
       dev_ctx_.SetBlob(local_key, target_memory_p);
-    } else if (!is_test) {
+    } else if (!is_persistent) {
       // Make reorder if needed
       auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
           dev_ctx_.GetBlob(key_reorder_p));

From e0436ad8bbaed57b9c2c60f100d1e1f86fe42e07 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 11 Sep 2018 16:07:07 +0800
Subject: [PATCH 31/85] refine fusion lstm infershape

---
 paddle/fluid/framework/operator.cc           | 277 ++++++++-----------
 paddle/fluid/framework/shape_runtime_infer.h |  86 ++++++
 paddle/fluid/operators/fusion_lstm_op.cc     |  81 ++++--
 3 files changed, 260 insertions(+), 184 deletions(-)
 create mode 100644 paddle/fluid/framework/shape_runtime_infer.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d58d6e4f3e..36025db7ba 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/shape_runtime_infer.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -458,187 +459,147 @@ bool OpSupportGPU(const std::string& op_type) {
   return false;
 }
 
-class RuntimeInferShapeContext : public InferShapeContext {
- public:
-  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
-      : op_(op), scope_(scope) {}
-
-  bool HasInput(const std::string& name) const override {
-    if (!op_.HasInputs(name)) {
-      return false;
-    }
-    auto& ins = Inputs(name);
-    size_t length = ins.size();
-    if (length == 0) {
-      return false;
-    }
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Input %s should not have more than one inputs", name);
-    auto ipt = ins[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
+bool RuntimeInferShapeContext::HasInput(const std::string& name) const {
+  if (!op_.HasInputs(name)) {
+    return false;
   }
-
-  bool HasOutput(const std::string& name) const override {
-    if (!op_.HasOutputs(name)) {
-      return false;
-    }
-    auto& outs = Outputs(name);
-    size_t length = outs.size();
-    if (length == 0) {
-      return false;
-    }
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Output %s should not have more than one inputs", name);
-    auto ipt = outs[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
+  auto& ins = Inputs(name);
+  size_t length = ins.size();
+  if (length == 0) {
+    return false;
   }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input %s should not have more than one inputs", name);
+  auto ipt = ins[0];
+  auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+  return var != nullptr;
+}
 
-  bool HasInputs(const std::string& name) const override {
-    if (!op_.HasInputs(name)) {
-      return false;
-    }
-    auto inputs = op_.Inputs(name);
-    if (inputs.empty()) {
-      return false;
-    }
-    for (auto& input : inputs) {
-      if (scope_.FindVar(input) == nullptr) {
-        return false;
-      }
-    }
-    return true;
+bool RuntimeInferShapeContext::HasOutput(const std::string& name) const {
+  if (!op_.HasOutputs(name)) {
+    return false;
   }
+  auto& outs = Outputs(name);
+  size_t length = outs.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Output %s should not have more than one inputs", name);
+  auto ipt = outs[0];
+  auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+  return var != nullptr;
+}
 
-  bool HasOutputs(const std::string& name) const override {
-    if (!op_.HasOutputs(name)) {
-      return false;
-    }
-    auto outputs = op_.Outputs(name);
-    if (outputs.empty()) {
+bool RuntimeInferShapeContext::HasInputs(const std::string& name) const {
+  if (!op_.HasInputs(name)) {
+    return false;
+  }
+  auto inputs = op_.Inputs(name);
+  if (inputs.empty()) {
+    return false;
+  }
+  for (auto& input : inputs) {
+    if (scope_.FindVar(input) == nullptr) {
       return false;
     }
-    for (auto& output : outputs) {
-      if (scope_.FindVar(output) == nullptr) {
-        return false;
-      }
-    }
-    return true;
   }
+  return true;
+}
 
-  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
-
-  const std::vector<std::string>& Inputs(
-      const std::string& name) const override {
-    return op_.Inputs(name);
+bool RuntimeInferShapeContext::HasOutputs(const std::string& name) const {
+  if (!op_.HasOutputs(name)) {
+    return false;
   }
-
-  const std::vector<std::string>& Outputs(
-      const std::string& name) const override {
-    return op_.Outputs(name);
+  auto outputs = op_.Outputs(name);
+  if (outputs.empty()) {
+    return false;
   }
+  for (auto& output : outputs) {
+    if (scope_.FindVar(output) == nullptr) {
+      return false;
+    }
+  }
+  return true;
+}
 
-  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
-                size_t j = 0) const override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-    if (!in_var->IsType<LoDTensor>()) return;
-    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-    auto in_tensor = in_var->Get<LoDTensor>();
-    auto* out_tensor = out_var->GetMutable<LoDTensor>();
-    out_tensor->set_lod(in_tensor.lod());
+void RuntimeInferShapeContext::ShareLoD(const std::string& in,
+                                        const std::string& out, size_t i,
+                                        size_t j) const {
+  PADDLE_ENFORCE_LT(i, Inputs(in).size());
+  PADDLE_ENFORCE_LT(j, Outputs(out).size());
+  Variable* in_var = scope_.FindVar(Inputs(in)[i]);
+  Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+  if (!in_var->IsType<LoDTensor>()) return;
+  PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                 "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+  auto in_tensor = in_var->Get<LoDTensor>();
+  auto* out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->set_lod(in_tensor.lod());
 
 // TODO(dzhwinter) : reuse ShareLoD in most operators.
 // Need to call ShareLayout explicitly in sequence related ops.
 // Shall we have a better method to shared info between in/out Tensor?
 #ifdef PADDLE_WITH_MKLDNN
-    // Fix me: ugly workaround below
-    // Correct solution:
-    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
-    //    layout of output tensor should be set "manually" in Compute()
-    //    of each OPKernel. The reason layout should NOT be shared between
-    //    input and output "automatically" (now by InferShape()->ShareLoD())
-    //    is that layout transform may occur after InferShape().
-    // Workaround:
-    //    Skip set_layout() when input layout is kMKLDNN
-    //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
-    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
-    //    in Compute()
-    if (in_tensor.layout() != DataLayout::kMKLDNN)
+  // Fix me: ugly workaround below
+  // Correct solution:
+  //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+  //    layout of output tensor should be set "manually" in Compute()
+  //    of each OPKernel. The reason layout should NOT be shared between
+  //    input and output "automatically" (now by InferShape()->ShareLoD())
+  //    is that layout transform may occur after InferShape().
+  // Workaround:
+  //    Skip set_layout() when input layout is kMKLDNN
+  //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
+  //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
+  //    in Compute()
+  if (in_tensor.layout() != DataLayout::kMKLDNN)
 #endif
-      out_tensor->set_layout(in_tensor.layout());
-  }
-
-  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
-                   size_t j = 0) const {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-    if (!in_var->IsType<LoDTensor>()) return;
-    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-    auto in_tensor = in_var->Get<LoDTensor>();
-    auto* out_tensor = out_var->GetMutable<LoDTensor>();
     out_tensor->set_layout(in_tensor.layout());
-  }
-
-  bool IsRuntime() const override { return true; }
-
- protected:
-  DDim GetDim(const std::string& name) const override {
-    Variable* var = scope_.FindVar(name);
-    PADDLE_ENFORCE_NOT_NULL(var);
-    if (var->IsType<LoDTensor>()) {
-      return var->Get<LoDTensor>().dims();
-    } else if (var->IsType<SelectedRows>()) {
-      return var->Get<SelectedRows>().GetCompleteDims();
-    } else {
-      PADDLE_THROW(
-          "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
-          "type_id is %s.",
-          name, var->Type().name());
-    }
-  }
-
-  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
-    PADDLE_THROW("Only compile time support this method");
-  }
-
-  void SetDim(const std::string& name, const DDim& dim) override {
-    Variable* var = scope_.FindVar(name);
-    if (var->IsType<LoDTensor>()) {
-      var->GetMutable<LoDTensor>()->Resize(dim);
-    } else if (var->IsType<SelectedRows>()) {
-      var->GetMutable<SelectedRows>()->set_height(dim[0]);
-    } else {
-      PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
-                   name, var->Type().name());
-    }
-  }
-
-  void SetRepeatedDims(const std::string& name,
-                       const std::vector<DDim>& dims) override {
-    PADDLE_THROW("Only compile time support this method");
-  }
+}
 
-  proto::VarType::Type GetVarType(const std::string& name) const override {
-    auto* var = scope_.FindVar(name);
-    return ToVarType(var->Type());
+void RuntimeInferShapeContext::ShareLayout(const std::string& in,
+                                           const std::string& out, size_t i,
+                                           size_t j) const {
+  PADDLE_ENFORCE_LT(i, Inputs(in).size());
+  PADDLE_ENFORCE_LT(j, Outputs(out).size());
+  Variable* in_var = scope_.FindVar(Inputs(in)[i]);
+  Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+  if (!in_var->IsType<LoDTensor>()) return;
+  PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                 "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+  auto in_tensor = in_var->Get<LoDTensor>();
+  auto* out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->set_layout(in_tensor.layout());
+}
+
+DDim RuntimeInferShapeContext::GetDim(const std::string& name) const {
+  Variable* var = scope_.FindVar(name);
+  PADDLE_ENFORCE_NOT_NULL(var);
+  if (var->IsType<LoDTensor>()) {
+    return var->Get<LoDTensor>().dims();
+  } else if (var->IsType<SelectedRows>()) {
+    return var->Get<SelectedRows>().GetCompleteDims();
+  } else {
+    PADDLE_THROW(
+        "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
+        "type_id is %s.",
+        name, var->Type().name());
   }
+}
 
-  InferShapeVarPtr GetVarPtr(const std::string& name) override {
-    return scope_.FindVar(name);
+void RuntimeInferShapeContext::SetDim(const std::string& name,
+                                      const DDim& dim) {
+  Variable* var = scope_.FindVar(name);
+  if (var->IsType<LoDTensor>()) {
+    var->GetMutable<LoDTensor>()->Resize(dim);
+  } else if (var->IsType<SelectedRows>()) {
+    var->GetMutable<SelectedRows>()->set_height(dim[0]);
+  } else {
+    PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", name,
+                 var->Type().name());
   }
-
- private:
-  const OperatorBase& op_;
-  const Scope& scope_;
-};
+}
 
 static void CheckTensorNANOrInf(const std::string& name,
                                 const framework::Tensor& tensor) {
diff --git a/paddle/fluid/framework/shape_runtime_infer.h b/paddle/fluid/framework/shape_runtime_infer.h
new file mode 100644
index 0000000000..04d4e33f7a
--- /dev/null
+++ b/paddle/fluid/framework/shape_runtime_infer.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_type.h"
+
+namespace paddle {
+namespace framework {
+
+class RuntimeInferShapeContext : public InferShapeContext {
+ public:
+  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
+      : op_(op), scope_(scope) {}
+
+  bool HasInput(const std::string& name) const override;
+  bool HasOutput(const std::string& name) const override;
+  bool HasInputs(const std::string& name) const override;
+  bool HasOutputs(const std::string& name) const override;
+
+  const OperatorBase& OpBase() const { return op_; }
+
+  const Scope& InferScope() const { return scope_; }
+  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
+
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
+    return op_.Inputs(name);
+  }
+
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
+    return op_.Outputs(name);
+  }
+
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const override;
+
+  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
+                   size_t j = 0) const;
+
+  bool IsRuntime() const override { return true; }
+
+ protected:
+  DDim GetDim(const std::string& name) const override;
+  void SetDim(const std::string& name, const DDim& dim) override;
+
+  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
+    PADDLE_THROW("Only compile time support this method");
+  }
+  void SetRepeatedDims(const std::string& name,
+                       const std::vector<DDim>& dims) override {
+    PADDLE_THROW("Only compile time support this method");
+  }
+
+  proto::VarType::Type GetVarType(const std::string& name) const override {
+    auto* var = scope_.FindVar(name);
+    return ToVarType(var->Type());
+  }
+
+  InferShapeVarPtr GetVarPtr(const std::string& name) override {
+    return scope_.FindVar(name);
+  }
+
+ private:
+  const OperatorBase& op_;
+  const Scope& scope_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
index ef23ab3f98..ae9d5d78ae 100644
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fusion_lstm_op.h"
 #include <string>
+#include "paddle/fluid/framework/shape_runtime_infer.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -24,26 +25,54 @@ namespace paddle {
 namespace operators {
 
 void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
-                 "Input(WeightX) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
-                 "Input(WeightH) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                 "Input(Bias) of LSTM should not be null.");
-
-  PADDLE_ENFORCE(ctx->HasOutput("XX"),
-                 "Output(XX) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                 "Output(Hidden) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
-                 "Output(Cell) of LSTM should not be null.");
+  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
+  if (runtime_ctx == nullptr) {
+    LOG(FATAL) << "Should have runtime infer context";
+  }
+  const auto& ins = runtime_ctx->OpBase().Inputs();
+  const auto& outs = runtime_ctx->OpBase().Outputs();
+  const auto& scope = runtime_ctx->InferScope();
+  const auto ins_end = ins.end();
+  const auto outs_end = outs.end();
+  auto fair_input = [&](const std::string& name) -> bool {
+    auto it = ins.find(name);
+    if (it == ins_end) {
+      return false;
+    }
+    const auto& in = it->second;
+    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(in[0]) != nullptr;
+  };
+  auto fair_output = [&](const std::string& name) -> bool {
+    auto it = outs.find(name);
+    if (it == outs_end) {
+      return false;
+    }
+    const auto& out = it->second;
+    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(out[0]) != nullptr;
+  };
+
+  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of LSTM.");
+  PADDLE_ENFORCE(fair_input("WeightX"),
+                 "Assert only one Input(WeightX) of LSTM.");
+  PADDLE_ENFORCE(fair_input("WeightH"),
+                 "Assert only one Input(WeightH) of LSTM.");
+  PADDLE_ENFORCE(fair_input("Bias"), "Assert only one Input(Bias) of LSTM.");
+  PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of LSTM.");
+  PADDLE_ENFORCE(fair_output("Hidden"),
+                 "Assert only one Output(Hidden) of LSTM.");
+  PADDLE_ENFORCE(fair_output("Cell"), "Assert only one Output(Cell) of LSTM.");
 
   auto x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
 
-  if (ctx->HasInput("H0")) {
-    PADDLE_ENFORCE(ctx->HasInput("C0"),
+  if (fair_input("H0")) {
+    PADDLE_ENFORCE(fair_input("C0"),
                    "Input(Cell) and Input(Hidden) of LSTM should not "
                    "be null at the same time.");
     auto h_dims = ctx->GetInputDim("H0");
@@ -95,16 +124,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                   "Output(BatchedInput) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
-                   "Output(BatchedHidden) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
-                   "Output(BatchedCell) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                   "Output(ReorderedH0) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
-                   "Output(ReorderedC0) of LSTM should not be null.");
+    PADDLE_ENFORCE(fair_output("BatchedInput"),
+                   "Assert only one Output(BatchedInput) of LSTM.");
+    PADDLE_ENFORCE(fair_output("BatchedHidden"),
+                   "Assert only one Output(BatchedHidden) of LSTM.");
+    PADDLE_ENFORCE(fair_output("BatchedCell"),
+                   "Assert only one Output(BatchedCell) of LSTM.");
+    PADDLE_ENFORCE(fair_output("ReorderedH0"),
+                   "Assert only one Output(ReorderedH0) of LSTM");
+    PADDLE_ENFORCE(fair_output("ReorderedC0"),
+                   "Assert only one Output(ReorderedC0) of LSTM.");
     ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
     ctx->SetOutputDim("BatchedHidden", out_dims);
     ctx->SetOutputDim("BatchedCell", out_dims);

From b681537e1a873a08e1b2f5a4bb78772ee0353279 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 11 Sep 2018 16:24:28 +0800
Subject: [PATCH 32/85] Add multiprocess reader (#13311)

* add multiprocess_reader

* add multiprocess_reader to reader decorator

* support piped multi process reader

* revert v2 decorator

* add comment to multiprocess_reader

* optimize code

* use ujson to speed up json serialize/deserialize

* add assert to multiprocess_reader

* update comment of multiprocess_reader

* optimize ujson import, handle error case

* optimize import ujson

* remove ujson from requirements.txt

* add import sys to decorator.py
---
 python/paddle/reader/decorator.py            | 99 +++++++++++++++++++-
 python/paddle/reader/tests/decorator_test.py | 29 ++++++
 2 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index 6d7ac876fd..5b9459b670 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -14,11 +14,14 @@
 
 __all__ = [
     'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader',
+    'multiprocess_reader'
 ]
 
 from threading import Thread
 import subprocess
+import multiprocessing
+import sys
 
 from six.moves.queue import Queue
 from six.moves import zip_longest
@@ -332,6 +335,100 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
     return xreader
 
 
+def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
+    """
+    multiprocess_reader uses Python multiprocessing to read data from readers
+    and then uses multiprocessing.Queue or multiprocessing.Pipe to merge all
+    the data. The number of processes equals the number of input readers;
+    each process calls one reader.
+
+    multiprocessing.Queue requires read/write access to /dev/shm, which some
+    platforms do not support.
+
+    You need to create multiple readers first. These readers should be
+    independent of each other so that each process can work independently.
+
+    An example:
+
+    .. code-block:: python
+
+        reader0 = reader(["file01", "file02"])
+        reader1 = reader(["file11", "file12"])
+        reader2 = reader(["file21", "file22"])
+        reader = multiprocess_reader([reader0, reader1, reader2],
+            queue_size=100, use_pipe=False)
+    """
+
+    try:
+        import ujson as json
+    except Exception as e:
+        sys.stderr.write("import ujson error: " + str(e) + " use json\n")
+        import json
+
+    assert type(readers) is list and len(readers) > 0
+
+    def _read_into_queue(reader, queue):
+        for sample in reader():
+            if sample is None:
+                raise ValueError("sample has None")
+            queue.put(sample)
+        queue.put(None)
+
+    def queue_reader():
+        queue = multiprocessing.Queue(queue_size)
+        for reader in readers:
+            p = multiprocessing.Process(
+                target=_read_into_queue, args=(reader, queue))
+            p.start()
+
+        reader_num = len(readers)
+        finish_num = 0
+        while finish_num < reader_num:
+            sample = queue.get()
+            if sample is None:
+                finish_num += 1
+            else:
+                yield sample
+
+    def _read_into_pipe(reader, conn):
+        for sample in reader():
+            if sample is None:
+                raise ValueError("sample has None!")
+            conn.send(json.dumps(sample))
+        conn.send(json.dumps(None))
+        conn.close()
+
+    def pipe_reader():
+        conns = []
+        for reader in readers:
+            parent_conn, child_conn = multiprocessing.Pipe()
+            conns.append(parent_conn)
+            p = multiprocessing.Process(
+                target=_read_into_pipe, args=(reader, child_conn))
+            p.start()
+
+        reader_num = len(readers)
+        finish_num = 0
+        conn_to_remove = []
+        while finish_num < reader_num:
+            for conn in conn_to_remove:
+                conns.remove(conn)
+            conn_to_remove = []
+            for conn in conns:
+                sample = json.loads(conn.recv())
+                if sample is None:
+                    finish_num += 1
+                    conn.close()
+                    conn_to_remove.append(conn)
+                else:
+                    yield sample
+
+    if use_pipe:
+        return pipe_reader
+    else:
+        return queue_reader
+
+
 def _buf2lines(buf, line_break="\n"):
     # FIXME: line_break should be automatically configured.
     lines = buf.split(line_break)
diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py
index 537df489b9..c324092f88 100644
--- a/python/paddle/reader/tests/decorator_test.py
+++ b/python/paddle/reader/tests/decorator_test.py
@@ -14,6 +14,7 @@
 
 import time
 import unittest
+import functools
 
 import paddle.reader
 
@@ -174,5 +175,33 @@ class TestPipeReader(unittest.TestCase):
             temp.close()
 
 
+class TestMultiProcessReader(unittest.TestCase):
+    def setup(self):  # not unittest's setUp hook; reader_test calls it
+        self.samples = []
+        for i in range(1000):
+            self.samples.append([[i], [i + 1, i + 2], i + 3])
+
+        def reader(index):  # yields the samples whose position % 3 == index
+            for i in range(len(self.samples)):
+                if i % 3 == index:
+                    yield self.samples[i]
+
+        self.reader0 = functools.partial(reader, 0)
+        self.reader1 = functools.partial(reader, 1)
+        self.reader2 = functools.partial(reader, 2)
+
+    def reader_test(self, use_pipe):
+        self.setup()
+        results = []
+        for data in paddle.reader.multiprocess_reader(
+            [self.reader0, self.reader1, self.reader2], 100, use_pipe)():
+            results.append(data)
+        self.assertEqual(sorted(self.samples), sorted(results))
+
+    def test_multi_process_reader(self):
+        self.reader_test(use_pipe=False)  # queue-based implementation
+        self.reader_test(use_pipe=True)  # pipe-based implementation
+
+
 if __name__ == '__main__':
     unittest.main()

From 03ff4f689213a6dc2c469dfd0c2cffe16e6b418d Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Tue, 11 Sep 2018 08:27:24 +0000
Subject: [PATCH 33/85] fix subgraph bug!

---
 .../inference/analysis/data_flow_graph.cc     |  39 +---
 .../inference/analysis/data_flow_graph.h      |   3 -
 .../analysis/data_flow_graph_to_fluid_pass.cc |  25 ++-
 .../inference/analysis/subgraph_splitter.cc   | 186 +++++++++++++++++-
 .../analysis/subgraph_splitter_tester.cc      |   2 +-
 paddle/fluid/operators/tensorrt_engine_op.h   |  20 +-
 6 files changed, 215 insertions(+), 60 deletions(-)

diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index e4f4bbf43c..8c7d58678f 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
     }
     return false;
   };
+
   for (auto &node : graph) {
     for (auto *in : node->inlinks) {
       // The Value that is written by nodes inside a sub-graph shouldn't be the
@@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
                         std::vector<Node *>(outputs.begin(), outputs.end()));
 }
 
+// Filter the Intermediate results of the subgraph node.
 void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
   std::vector<Node *> op_nodes;
   for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
@@ -484,46 +486,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
         out->SetDeleted();
       }
     }
-    PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
+    // The filtered_subgraph_outlinks may be empty.
     op_nodes[i]->outlinks = filtered_subgraph_outlinks;
   }
 }
 
-void FlexibleDFS(const std::vector<Node *> &source, bool reverse,
-                 const std::function<bool(const Node *)> &enter,
-                 const std::function<bool(const Node *)> &leave) {
-  typedef struct {
-    const Node *node;
-    bool leave;
-  } FNode;
-  std::vector<FNode> stack;
-  for (auto &node : source) {
-    stack.push_back(FNode{node, false});
-  }
-  std::unordered_set<const Node *> visited;
-  while (!stack.empty()) {
-    auto fnode = stack.back();
-    stack.pop_back();
-
-    if (fnode.leave) {
-      if (leave && !leave(fnode.node)) return;
-    }
-    if (visited.count(fnode.node)) continue;
-    visited.insert(fnode.node);
-
-    if (enter && !enter(fnode.node)) return;
-
-    if (leave) stack.push_back(FNode{fnode.node, true});
-    const std::vector<Node *> iter_nodes =
-        reverse == true ? fnode.node->inlinks : fnode.node->outlinks;
-    for (const Node *node : iter_nodes) {
-      if (!visited.count(node)) {
-        stack.push_back(FNode{node, false});
-      }
-    }
-  }
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index 4fefc175f3..437e097acd 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -204,9 +204,6 @@ std::pair<std::vector<Node *>, std::vector<Node *>>
 ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);  // NOLINT
 
 void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph);
-void FlexibleDFS(const std::vector<Node *> &source, bool reverse,
-                 const std::function<bool(const Node *)> &enter,
-                 const std::function<bool(const Node *)> &leave);
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 80c85555e7..47e9752ff2 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
 
   // collect inputs
   std::unordered_set<std::string> input_names;
+  std::unordered_set<std::string> input_names_with_id;
   for (auto *x : func->inlinks) {
     input_names.insert(x->name());
+    input_names_with_id.insert(x->name() + std::to_string(x->id()));
   }
   desc.SetInput(
       "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
 
   std::unordered_set<std::string> output_names;
+  std::unordered_set<std::string> output_names_with_id;
   for (auto *x : func->outlinks) {
     output_names.insert(x->name());
+    output_names_with_id.insert(x->name() + std::to_string(x->id()));
   }
 
-  std::vector<std::string> output_temp(output_names.begin(),
-                                       output_names.end());
-  desc.SetOutput("Ys", output_temp);
+  desc.SetOutput(
+      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
   desc.SetType("tensorrt_engine");
 
   std::unordered_map<std::string, std::string> output_name_map;
@@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
       std::vector<std::string> replaced_names;
       for (int k = 0; k < in_var->arguments_size(); k++) {
         std::string arg_value = in_var->arguments(k);
-        if (input_names.count(arg_value)) {
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (input_names_with_id.count(arg_value_with_id)) {
           replaced_names.push_back(arg_value);
         } else {
-          replaced_names.push_back(arg_value +
-                                   std::to_string(var2id[arg_value]));
+          replaced_names.push_back(arg_value_with_id);
         }
       }
       in_var->clear_arguments();
@@ -176,11 +180,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
       std::vector<std::string> replaced_names;
       for (int k = 0; k < out_var->arguments_size(); k++) {
         std::string arg_value = out_var->arguments(k);
-        if (output_names.count(arg_value)) {
-          output_name_map[arg_value] =
-              arg_value + std::to_string(var2id[arg_value]);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (output_names_with_id.count(arg_value_with_id)) {
+          output_name_map[arg_value] = arg_value_with_id;
         }
-        replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
+        replaced_names.push_back(arg_value_with_id);
       }
       out_var->clear_arguments();
       for (size_t k = 0; k < replaced_names.size(); k++) {
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index 670a8de667..857375fc21 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -74,13 +74,126 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
   node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
 }
 
+// This is a simple representation of a graph.
+// The BriefNode holds the pointer of the Node.
+// This is to avoid changing the original graph
+// in the process of trt graph analysis.
+struct BriefNode {
+  explicit BriefNode(Node *n) { node = n; }
+  Node *node;                         // node of the original graph
+  std::vector<BriefNode *> inlinks;   // predecessors in the brief graph
+  std::vector<BriefNode *> outlinks;  // successors in the brief graph
+};
+
+void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
+                          int src_id, int dst_id) {
+  // Contract the edge src -> dst: src absorbs every link of dst, and dst is
+  // left isolated (it stays in node_map but has no inlinks or outlinks).
+  BriefNode *src_node = node_map.at(src_id);
+  BriefNode *dst_node = node_map.at(dst_id);
+
+  // Sets de-duplicate neighbours that were linked to both src and dst.
+  std::unordered_set<BriefNode *> inputs(src_node->inlinks.begin(),
+                                         src_node->inlinks.end());
+  std::unordered_set<BriefNode *> outputs;
+
+  // The src -> dst edge itself disappears after the contraction.
+  for (auto *n : src_node->outlinks) {
+    if (n != dst_node) outputs.insert(n);
+  }
+
+  // Add the inlinks and outlinks of dst node to src node.
+  for (BriefNode *node : dst_node->inlinks) {
+    if (node != src_node) inputs.insert(node);
+  }
+
+  for (BriefNode *node : dst_node->outlinks) {
+    outputs.insert(node);
+  }
+
+  // update the dst and src node's inlinks and outlinks.
+  src_node->inlinks.assign(inputs.begin(), inputs.end());
+  src_node->outlinks.assign(outputs.begin(), outputs.end());
+  dst_node->inlinks.clear();
+  dst_node->outlinks.clear();
+
+  // Rewrites references to dst (or src) in a neighbour's link list so that
+  // they point at src.
+  auto inlink_or_outlink_cleaner = [&](std::vector<BriefNode *> &nodes) {
+    for (auto *&n : nodes) {
+      if (n == src_node || n == dst_node) {
+        n = src_node;
+      }
+    }
+  };
+
+  // Change all the dst inputs and outputs corresponding inlink and
+  // outlink to the src node.
+  for (auto *node : src_node->inlinks) {
+    inlink_or_outlink_cleaner(node->outlinks);
+  }
+
+  for (auto *node : src_node->outlinks) {
+    inlink_or_outlink_cleaner(node->inlinks);
+  }
+}
+
+// FlexibleDfS
+// If reverse is true, do reverse dfs.
+// If enter func is not nullptr, calls enter(node) before visiting any children
+// of node.
+// If leave func is not nullptr, calls leave(node) after the nodes reached from
+// node (children, or parents when reverse is true) have been visited.
+void FlexibleDFS(const std::vector<BriefNode *> &source, bool reverse,
+                 const std::function<bool(const BriefNode *)> &enter,
+                 const std::function<bool(const BriefNode *)> &leave) {
+  typedef struct {
+    const BriefNode *node;
+    bool leave;  // true: this entry only triggers the leave callback.
+  } FNode;
+
+  std::vector<FNode> stack;
+  for (auto &node : source) {
+    stack.push_back(FNode{node, false});
+  }
+  std::unordered_set<const BriefNode *> visited;
+  while (!stack.empty()) {
+    auto fnode = stack.back();
+    stack.pop_back();
+
+    if (fnode.leave) {
+      if (leave && !leave(fnode.node)) return;  // callback aborts the search
+    }
+    if (visited.count(fnode.node)) continue;
+    visited.insert(fnode.node);
+
+    if (enter && !enter(fnode.node)) return;
+    if (leave) stack.push_back(FNode{fnode.node, true});
+    // Bind by reference: copying the link vector per visited node is waste.
+    const std::vector<BriefNode *> &iter_nodes =
+        reverse ? fnode.node->inlinks : fnode.node->outlinks;
+    for (const BriefNode *node : iter_nodes) {
+      if (!visited.count(node)) {
+        stack.push_back(FNode{node, false});
+      }
+    }
+  }
+}
+
 std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
+  // Run the Extract algorithm to find all subgraphs.
   std::vector<Node *> marked_nodes;
+  //  We use brief_node_map to represent the original graph in order to avoid
+  //  changing the original graph.
+  std::unordered_map<int, BriefNode *> brief_node_map;
+
   for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) {
+    brief_node_map[node.id()] = new BriefNode(&node);
     if (node.attr(kMarkerAttrName).Bool()) {
       marked_nodes.push_back(&node);
     }
   }
+
   // extract sub-graphs in the marked node set, use Union Find algorithm.
   node_map_t node_map;  // id to ptr
   for (auto *n : marked_nodes) {
@@ -88,11 +201,73 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
     n->attr(kUnionFindParent).Int32() = n->id();
     node_map[n->id()] = n;
   }
-  std::unordered_set<Node *> visited;
-  for (auto *n : marked_nodes) {
-    for (auto *out : n->outlinks) {
-      if (node_map.count(out->id())) {
-        UnionFindCombine(node_map, n->id(), out->id());
+
+  // create brief node map
+  for (auto &itr : brief_node_map) {
+    for (Node *node : itr.second->node->inlinks) {
+      itr.second->inlinks.push_back(brief_node_map[node->id()]);
+    }
+
+    for (Node *node : itr.second->node->outlinks) {
+      itr.second->outlinks.push_back(brief_node_map[node->id()]);
+    }
+  }
+
+  for (auto &itr : brief_node_map) {
+    BriefNode *brief_node = itr.second;
+
+    if (!brief_node->node->attr(kMarkerAttrName).Bool()) {
+      VLOG(4) << brief_node->node->id() << " node not a trt candicate.";
+      continue;
+    }
+
+    //  Our algorithm must guarantee that:
+    //  1. The graph is always a directed acyclic graph (DAG).
+    //  2. If there is a path in the subgraph from X to Y (X and Y are both
+    //     nodes in the subgraph), then all paths from X to Y are in the
+    //     subgraph.
+    //
+    //  In order to achieve the above guarantee.
+    //  For adjacent nodes src -> dst.
+    //  1. Get all dst input nodes except src.
+    //  2. Reverse DFS from those input nodes
+    //  3. If there is a path from input nodes to src,
+    //  then the src and dst nodes can not be fused into one node,
+    //  otherwise it can be done.
+
+    while (true) {
+      std::unordered_set<BriefNode *> contract_nodes;
+      for (auto *out : brief_node->outlinks) {
+        // must be a trt candidate
+        if (!out->node->attr(kMarkerAttrName).Bool()) continue;
+        // get all dst input nodes except src.
+        std::vector<BriefNode *> source_nodes;
+        for (auto *n : out->inlinks) {
+          if (n != brief_node) {
+            source_nodes.push_back(n);
+          }
+        }
+
+        // Reverse DFS from the source_nodes.
+        bool have_excess_path = false;
+        FlexibleDFS(source_nodes, true, nullptr,
+                    [&have_excess_path, brief_node](const BriefNode *n) {
+                      if (n == brief_node) {
+                        have_excess_path = true;
+                        return false;
+                      }
+                      return true;
+                    });
+        if (have_excess_path) continue;
+        contract_nodes.insert(out);
+      }
+      if (contract_nodes.empty()) break;
+
+      for (auto dst_node : contract_nodes) {
+        UnionFindCombine(node_map, brief_node->node->id(),
+                         dst_node->node->id());
+        UnionContractedNodes(brief_node_map, brief_node->node->id(),
+                             dst_node->node->id());
       }
     }
   }
@@ -128,6 +303,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
     auto io = ExtractInputAndOutputOfSubGraph(subgraph);
     block_node->inlinks = std::move(io.first);
     block_node->outlinks = std::move(io.second);
+
     for (auto *node : subgraph) {
       // TODO(Superjomn) need a unified mechanism to treat deleted node in each
       // pass.
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
index 39cc433b40..531a170512 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) {
 
   // At least one nodes should be deleted.
   ASSERT_EQ(dfg.nodes.size(), count0 + 1);  // added a new FunctionBlock
-  ASSERT_EQ(6, count1);
+  ASSERT_EQ(11, count1);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index bc556ab364..395d8bcc07 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -160,11 +160,21 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
           fluid_t->mutable_data<float>(platform::CUDAPlace(
               boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
           size * sizeof(float));
-      //} else {
-      // engine->GetOutputInGPU(
-      // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
-      // size * sizeof(float));
-      //}
+
+      // TODO(zhaolong) : delete it sometimes
+      /* THIS CODE JUST FOR TEST
+      std::cout << output_maps[output_index] << std::endl;
+      platform::CPUPlace cpu_place;
+      framework::LoDTensor temp_tensor;
+      temp_tensor.Resize(framework::make_ddim(ddim));
+      auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
+
+      TensorCopySync(*fluid_t, cpu_place ,&temp_tensor);
+      for(int i = 0; i < size; i++) {
+        std::cout << temp_data[i] <<  " " ;
+      }
+      std::cout << std::endl;
+      */
       output_index += 1;
     }
 

From df161e08f0974b5fc77a62714c94bcdb8f04c412 Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Tue, 11 Sep 2018 08:36:29 +0000
Subject: [PATCH 34/85] delete unuse ut

---
 .../analysis/data_flow_graph_tester.cc        | 71 -------------------
 .../inference/analysis/subgraph_splitter.cc   |  2 +-
 2 files changed, 1 insertion(+), 72 deletions(-)

diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index 040ca19514..1682011c3d 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -160,77 +160,6 @@ TEST(DataFlowGraph, Build_IR_Graph) {
   ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size());
 }
 
-// FlexibleDFS
-/*
- * Graph topology
- * inputs: 0
- * 0 -> 1
- * 1 -> 2
- * 1 -> 3
- * 3 -> 4
- * 4 -> 5
- * 5 -> 2
- */
-TEST(DataFlowGraph, flexibledfs) {
-  DataFlowGraph graph;
-
-  for (int i = 0; i < 6; i++) {
-    auto* node = graph.nodes.Create(Node::Type::kValue);
-    node->SetName("node-" + std::to_string(i));
-  }
-
-  auto add_link = [&](int i, int j) {
-    Node* source = graph.nodes.GetMutable(i);
-    Node* target = graph.nodes.GetMutable(j);
-    target->inlinks.push_back(source);
-    source->outlinks.push_back(target);
-  };
-
-  add_link(0, 1);
-  add_link(1, 2);
-  add_link(1, 3);
-  add_link(3, 4);
-  add_link(4, 5);
-  add_link(5, 2);
-  graph.Build();
-
-  std::vector<const Node*> order;
-  FlexibleDFS(graph.inputs(), false, nullptr, [&order](const Node* n) {
-    order.push_back(n);
-    return true;
-  });
-
-  ASSERT_EQ(order.size(), 6UL);
-
-  order.clear();
-  // reverse dfs
-  FlexibleDFS(graph.outputs(), true, nullptr, [&order](const Node* n) {
-    order.push_back(n);
-    return true;
-  });
-
-  ASSERT_EQ(order.size(), 6UL);
-
-  // If we delete
-  Node* last_node = graph.nodes.GetMutable(2);
-  Node* direct_node = graph.nodes.GetMutable(1);
-  std::vector<Node*> source_nodes;
-  for (Node* node : last_node->inlinks) {
-    if (node != direct_node) source_nodes.push_back(node);
-  }
-
-  bool has_cycle = false;
-  FlexibleDFS(source_nodes, true, nullptr,
-              [&has_cycle, direct_node](const Node* n) {
-                if (n == direct_node) {
-                  has_cycle = true;
-                  return false;
-                }
-                return true;
-              });
-  ASSERT_TRUE(has_cycle);
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index 857375fc21..773fceeeb2 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -138,7 +138,7 @@ void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
   }
 }
 
-// FlexibleDfS
+// FlexibleDFS
 // If reverse is true, do reverse dfs.
 // If enter func is not nullptr, calls enter(node) before visiting any children
 // of node.

From 5d34ef61cbeacb7089f6e28de685c79db324f207 Mon Sep 17 00:00:00 2001
From: Michal Gallus <michal.gallus@intel.com>
Date: Tue, 4 Sep 2018 12:02:57 +0200
Subject: [PATCH 35/85] Fuse MKLDNN's Conv + ReLU

---
 paddle/fluid/operators/conv_mkldnn_op.cc      | 46 +++++++++++++++----
 paddle/fluid/operators/conv_op.cc             |  2 +
 .../fluid/transpiler/inference_transpiler.py  | 39 ++++++++++++++--
 3 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index c5cbadc892..53e705c8ca 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -296,6 +296,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
     int groups = ctx.Attr<int>("groups");
 
     // TODO(pzelazko-intel) add support for group convolution and dilation
@@ -348,11 +349,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       bias_tz = paddle::framework::vectorize2int(bias->dims());
       auto bias_md = platform::MKLDNNMemDesc(
           bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
-      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
-                                     strides, paddings, mkldnn_engine);
+      conv_pd =
+          ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides,
+                               paddings, mkldnn_engine, fuse_relu);
     } else {
       conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
-                                     paddings, mkldnn_engine);
+                                     paddings, mkldnn_engine, fuse_relu);
     }
     // Save conv_pd/src_memory/weights_memory for backward pass
     dev_ctx.SetBlob(key_conv_pd, conv_pd);
@@ -402,11 +404,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   }
 
  private:
+  mkldnn::primitive_attr AddRelu() const {
+    // Fusion with ReLU layer is executed through the PostOps feature. Create a
+    // PostOps object and configure it to execute an eltwise relu operation.
+    mkldnn::primitive_attr conv_attr;
+    constexpr float scale = 1.0f;
+    constexpr float negative_slope = 0.0f;
+    constexpr float placeholder = 0.0f;
+    mkldnn::post_ops post_operations;
+    post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                   negative_slope, placeholder);
+    conv_attr.set_post_ops(post_operations);
+    return conv_attr;
+  }
+
   std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
   ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                        const memory::desc& dst, const std::vector<int>& strides,
                        const std::vector<int>& paddings,
-                       const mkldnn::engine& engine) const {
+                       const mkldnn::engine& engine,
+                       const bool fuse_relu) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
@@ -415,8 +432,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         dst, stride_dims, padding_dims, padding_dims,
         mkldnn::padding_kind::zero);
 
-    auto p_conv_pd =
-        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
+    mkldnn::primitive_attr conv_attr;
+    if (fuse_relu) {
+      conv_attr = AddRelu();
+    }
+
+    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
+        conv_desc, conv_attr, engine);
 
     return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
         p_conv_pd);
@@ -427,7 +449,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const memory::desc& bias, const memory::desc& dst,
                        const std::vector<int>& strides,
                        const std::vector<int>& paddings,
-                       const mkldnn::engine& engine) const {
+                       const mkldnn::engine& engine,
+                       const bool fuse_relu) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
 
@@ -436,8 +459,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         bias, dst, stride_dims, padding_dims, padding_dims,
         mkldnn::padding_kind::zero);
 
-    auto p_conv_pd =
-        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
+    mkldnn::primitive_attr conv_attr;
+    if (fuse_relu) {
+      conv_attr = AddRelu();
+    }
+
+    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
+        conv_desc, conv_attr, engine);
 
     return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
         p_conv_pd);
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 61ca80877a..3332e64301 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -161,6 +161,8 @@ void Conv2DOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index 02fefe32df..adad2428f7 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -60,12 +60,46 @@ class InferenceTranspiler(object):
         if not isinstance(scope, core.Scope):
             raise TypeError("scope should be as Scope type or None")
         use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
+
         self._fuse_batch_norm(program, place, scope)
         if use_mkldnn:
-            self._fuse_relu_mkldnn(program)
             self._fuse_conv_bias_mkldnn(program)
+            self._fuse_conv_relu_mkldnn(program)
+            self._fuse_bn_relu_mkldnn(program)
+
+    def _fuse_conv_relu_mkldnn(self, program):
+        '''
+        Transpile the program by fused relu activation for MKLDNN program.
+        A relu OP that directly follows a conv2d OP in block 0 is fused by
+        setting the 'fuse_relu' attribute on the conv2d OP and removing relu.
+        The result of fuse is:
+            - before:
+                - conv->relu->any_other_op
+            - after:
+                - conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        # Stop one OP early: the last OP has no successor to fuse with, and
+        # indexing ops[i + 1] there would raise IndexError.
+        while i < len(self.block.ops) - 1:
+            current_op = self.block.ops[i]
+            next_op = self.block.ops[i + 1]
+            if current_op.type == 'conv2d' and next_op.type == 'relu':
+                # modify conv OP to include relu, then remove the relu OP
+                current_op.set_attr("fuse_relu", True)
+                self.block._remove_op(i + 1)
+            i = i + 1
+
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
 
-    def _fuse_relu_mkldnn(self, program):
+    def _fuse_bn_relu_mkldnn(self, program):
         '''
         Transpile the program by fused relu activation for MKLDNN program.
 
@@ -159,7 +193,6 @@ class InferenceTranspiler(object):
                 self._fuse_conv_bias(i, current_op, next_op)
                 self.block._remove_op(i + 1)  # Remove old conv
                 self.block._remove_op(i + 1)  # Remove elementwise_add
-                i = i + 1
             i = i + 1
 
         self._remove_unused_var()

From accdecc6814b8070d3e3bdbec77b162d954f21d6 Mon Sep 17 00:00:00 2001
From: Krzysztof Binias <krzysztof.binias@intel.com>
Date: Tue, 11 Sep 2018 11:16:05 +0200
Subject: [PATCH 36/85] Correcting Lint errors

---
 paddle/fluid/operators/conv_mkldnn_op.cc | 4 ++--
 paddle/fluid/platform/mkldnn_helper.h    | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 244a578db8..fa9ee637c5 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -130,8 +130,8 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      const std::vector<mkldnn::primitive>& pipeline,
-      bool is_persistent = false) {  // NOLINT
+      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
+      bool is_persistent = false) {
     auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
     auto weights_pd = conv_pd_->weights_primitive_desc();
     return this->AcquireMemory(weights_pd, user_weights_pd,
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index cf08202ccb..c0a2543ba5 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -191,8 +191,9 @@ class MKLDNNHandler {
       mkldnn::memory::primitive_desc& mpd,       // NOLINT
       mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      const std::string& suffix, const std::vector<mkldnn::primitive>& pipeline,
-      bool is_persistent = false) {  // NOLINT
+      const std::string& suffix,
+      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
+      bool is_persistent = false) {
     // create reorder primitive if the input format is not the preferred one
     auto local_key = key_ + suffix;
     auto key_reorder_p = key_ + suffix + "reorder_p";

From a5556d44175931682bb049451639948c0da7ed6e Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 11 Sep 2018 17:49:54 +0800
Subject: [PATCH 37/85] refine attentionlstm infershape

---
 paddle/fluid/operators/attention_lstm_op.cc | 88 ++++++++++++++-------
 1 file changed, 60 insertions(+), 28 deletions(-)

diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 39b0c85699..ac4ddb5502 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
+#include "paddle/fluid/framework/shape_runtime_infer.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -23,29 +24,60 @@ namespace paddle {
 namespace operators {
 
 void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"),
-                 "Input(X) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("C0"),
-                 "Input(C0) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"),
-                 "Input(LSTMWeight) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("LSTMBias"),
-                 "Input(LSTMBias) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"),
-                 "Input(AttentionWeight) of AttentionLSTM should not be null.");
-
-  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                 "Output(Hidden) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
-                 "Output(Cell) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"),
-                 "Output(AttentionedX) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"),
-                 "Output(AttentionFCOut) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("LSTMX"),
-                 "Output(LSTMX) of AttentionLSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"),
-                 "Output(LSTMOUT) of AttentionLSTM should not be null.");
+  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
+  if (runtime_ctx == nullptr) {
+    LOG(FATAL) << "Should have runtime infer context";
+  }
+  const auto& ins = runtime_ctx->OpBase().Inputs();
+  const auto& outs = runtime_ctx->OpBase().Outputs();
+  const auto& scope = runtime_ctx->InferScope();
+  const auto ins_end = ins.end();
+  const auto outs_end = outs.end();
+  auto fair_input = [&](const std::string& name) -> bool {
+    auto it = ins.find(name);
+    if (it == ins_end) {
+      return false;
+    }
+    const auto& in = it->second;
+    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(in[0]) != nullptr;
+  };
+  auto fair_output = [&](const std::string& name) -> bool {
+    auto it = outs.find(name);
+    if (it == outs_end) {
+      return false;
+    }
+    const auto& out = it->second;
+    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(out[0]) != nullptr;
+  };
+
+  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_input("C0"),
+                 "Assert only one Input(C0) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_input("LSTMWeight"),
+                 "Assert only one Input(LSTMWeight) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_input("LSTMBias"),
+                 "Assert only one Input(LSTMBias) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_input("AttentionWeight"),
+                 "Assert only one Input(AttentionWeight) of AttentionLSTM.");
+
+  PADDLE_ENFORCE(fair_output("Hidden"),
+                 "Assert only one Output(Hidden) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_output("Cell"),
+                 "Assert only one Output(Cell) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_output("AttentionedX"),
+                 "Assert only one Output(AttentionedX) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_output("AttentionFCOut"),
+                 "Assert only one Output(AttentionFCOut) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_output("LSTMX"),
+                 "Assert only one Output(LSTMX) of AttentionLSTM.");
+  PADDLE_ENFORCE(fair_output("LSTMOUT"),
+                 "Assert only one Output(LSTMOUT) of AttentionLSTM.");
 
   auto x_dims = ctx->GetInputDim("X");
   const int M = x_dims[1];
@@ -65,7 +97,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
   auto c_dims = ctx->GetInputDim("C0");
   PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2.");
   PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D);
-  if (ctx->HasInput("H0")) {
+  if (fair_input("H0")) {
     auto h_dims = ctx->GetInputDim("H0");
     PADDLE_ENFORCE(h_dims == c_dims,
                    "The dimension of Input(H0) and Input(C0) "
@@ -79,7 +111,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
   PADDLE_ENFORCE_EQ(atten_w_dims[1], 1,
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
-  if (ctx->HasInput("AttentionBias")) {
+  if (fair_input("AttentionBias")) {
     auto atten_b_dims = ctx->GetInputDim("AttentionBias");
     PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2,
                       "Input(AttentionBias)'s rank must be 2.");
@@ -89,7 +121,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                       "AttentionBias shapes must be 1 * 1.");
   }
 
-  if (ctx->HasInput("AttentionScalar")) {
+  if (fair_input("AttentionScalar")) {
     auto dims = ctx->GetInputDim("AttentionScalar");
     PADDLE_ENFORCE_EQ(dims.size(), 2,
                       "Input(AttentionScalar)'s rank must be 2.");
@@ -97,10 +129,10 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
     PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1.");
   }
 
-  if (ctx->HasInput("AttentionScalarBias")) {
+  if (fair_input("AttentionScalarBias")) {
     auto dims = ctx->GetInputDim("AttentionScalarBias");
     PADDLE_ENFORCE(
-        ctx->HasInput("AttentionScalar"),
+        fair_input("AttentionScalar"),
         "AttentionScalar should not be null when have AttentionScalarBias.");
     PADDLE_ENFORCE_EQ(dims.size(), 2,
                       "Input(AttentionScalarBias)'s rank must be 2.");

From 916f42bcbf7bc308f2135be5f341b8628cc883dc Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 11 Sep 2018 18:00:20 +0800
Subject: [PATCH 38/85] refine fusion gru infershape

---
 paddle/fluid/operators/fusion_gru_op.cc | 65 +++++++++++++++++++------
 1 file changed, 49 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index 916f84cb4a..bcdcb2ac4d 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fusion_gru_op.h"
 #include <cstring>  // for memcpy
 #include <string>
+#include "paddle/fluid/framework/shape_runtime_infer.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -25,14 +26,46 @@ namespace paddle {
 namespace operators {
 
 void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
-                 "Input(WeightX) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
-                 "Input(WeightH) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
-                 "Output(Hidden) of GRU should not be null.");
+  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
+  if (runtime_ctx == nullptr) {
+    LOG(FATAL) << "Should have runtime infer context";
+  }
+  const auto& ins = runtime_ctx->OpBase().Inputs();
+  const auto& outs = runtime_ctx->OpBase().Outputs();
+  const auto& scope = runtime_ctx->InferScope();
+  const auto ins_end = ins.end();
+  const auto outs_end = outs.end();
+  auto fair_input = [&](const std::string& name) -> bool {
+    auto it = ins.find(name);
+    if (it == ins_end) {
+      return false;
+    }
+    const auto& in = it->second;
+    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(in[0]) != nullptr;
+  };
+  auto fair_output = [&](const std::string& name) -> bool {
+    auto it = outs.find(name);
+    if (it == outs_end) {
+      return false;
+    }
+    const auto& out = it->second;
+    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
+      return false;
+    }
+    return scope.FindVar(out[0]) != nullptr;
+  };
+
+  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of GRU.");
+  PADDLE_ENFORCE(fair_input("WeightX"),
+                 "Assert only one Input(WeightX) of GRU.");
+  PADDLE_ENFORCE(fair_input("WeightH"),
+                 "Assert only one Input(WeightH) of GRU.");
+  PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of GRU.");
+  PADDLE_ENFORCE(fair_output("Hidden"),
+                 "Assert only one Output(Hidden) of GRU.");
 
   auto x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
@@ -58,12 +91,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                     "should be 3 * %d.",
                     frame_size);
 
-  if (ctx->HasInput("H0")) {
+  if (fair_input("H0")) {
     auto h0_dims = ctx->GetInputDim("H0");
     PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
                       "The width of H0 must be equal to frame_size.");
   }
-  if (ctx->HasInput("Bias")) {
+  if (fair_input("Bias")) {
     auto b_dims = ctx->GetInputDim("Bias");
     PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
     PADDLE_ENFORCE_EQ(b_dims[0], 1,
@@ -79,12 +112,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
-    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
-                   "Output(ReorderedH0) of GRU should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
-                   "Output(BatchedInput) of GRU should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
-                   "Output(BatchedOut) of GRU should not be null.");
+    PADDLE_ENFORCE(fair_output("ReorderedH0"),
+                   "Assert only one Output(ReorderedH0) of GRU.");
+    PADDLE_ENFORCE(fair_output("BatchedInput"),
+                   "Assert only one Output(BatchedInput) of GRU.");
+    PADDLE_ENFORCE(fair_output("BatchedOut"),
+                   "Assert only one Output(BatchedOut) of GRU.");
     ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
     ctx->SetOutputDim("BatchedOut", out_dims);
   }

From 8e0fe035d478a8bfb7bea888b986eafa827dcbf1 Mon Sep 17 00:00:00 2001
From: superjomn <yanchunwei@outlook.com>
Date: Tue, 11 Sep 2018 10:16:19 +0000
Subject: [PATCH 39/85] fix ner_test when bs>1

---
 paddle/fluid/inference/tests/api/analyzer_ner_tester.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index 661b047ed7..6e8e43add7 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -144,8 +144,9 @@ void TestChineseNERPrediction(bool use_analysis) {
     size_t num_samples;
     for (int i = 0; i < FLAGS_repeat; i++) {
       DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+      // Just one batch, the num_samples remains the same.
       num_samples = data.num_samples;
-      for (size_t bid = 0; bid < num_samples; ++bid) {
+      for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) {
         PrepareInputs(&input_slots, &data, FLAGS_batch_size);
         timer.tic();
         predictor->Run(input_slots, &outputs);

From 8a1abe54d797de7c4f17ab92d2268c3cebf83b66 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 11 Sep 2018 18:30:49 +0800
Subject: [PATCH 40/85] clean fusion infershape code

---
 paddle/fluid/operators/attention_lstm_op.cc   | 35 +----------
 paddle/fluid/operators/fusion_gru_op.cc       | 35 +----------
 .../operators/fusion_infershape_define.h      | 60 +++++++++++++++++++
 paddle/fluid/operators/fusion_lstm_op.cc      | 35 +----------
 4 files changed, 66 insertions(+), 99 deletions(-)
 create mode 100644 paddle/fluid/operators/fusion_infershape_define.h

diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index ac4ddb5502..7531aa9a46 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
-#include "paddle/fluid/framework/shape_runtime_infer.h"
+#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -24,38 +24,7 @@ namespace paddle {
 namespace operators {
 
 void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
-  if (runtime_ctx == nullptr) {
-    LOG(FATAL) << "Should have runtime infer context";
-  }
-  const auto& ins = runtime_ctx->OpBase().Inputs();
-  const auto& outs = runtime_ctx->OpBase().Outputs();
-  const auto& scope = runtime_ctx->InferScope();
-  const auto ins_end = ins.end();
-  const auto outs_end = outs.end();
-  auto fair_input = [&](const std::string& name) -> bool {
-    auto it = ins.find(name);
-    if (it == ins_end) {
-      return false;
-    }
-    const auto& in = it->second;
-    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(in[0]) != nullptr;
-  };
-  auto fair_output = [&](const std::string& name) -> bool {
-    auto it = outs.find(name);
-    if (it == outs_end) {
-      return false;
-    }
-    const auto& out = it->second;
-    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(out[0]) != nullptr;
-  };
-
+  FUSION_INFERSHAPE_INIT;
   PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of AttentionLSTM.");
   PADDLE_ENFORCE(fair_input("C0"),
                  "Assert only one Input(C0) of AttentionLSTM.");
diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index bcdcb2ac4d..b10d311f05 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fusion_gru_op.h"
 #include <cstring>  // for memcpy
 #include <string>
-#include "paddle/fluid/framework/shape_runtime_infer.h"
+#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -26,38 +26,7 @@ namespace paddle {
 namespace operators {
 
 void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
-  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
-  if (runtime_ctx == nullptr) {
-    LOG(FATAL) << "Should have runtime infer context";
-  }
-  const auto& ins = runtime_ctx->OpBase().Inputs();
-  const auto& outs = runtime_ctx->OpBase().Outputs();
-  const auto& scope = runtime_ctx->InferScope();
-  const auto ins_end = ins.end();
-  const auto outs_end = outs.end();
-  auto fair_input = [&](const std::string& name) -> bool {
-    auto it = ins.find(name);
-    if (it == ins_end) {
-      return false;
-    }
-    const auto& in = it->second;
-    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(in[0]) != nullptr;
-  };
-  auto fair_output = [&](const std::string& name) -> bool {
-    auto it = outs.find(name);
-    if (it == outs_end) {
-      return false;
-    }
-    const auto& out = it->second;
-    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(out[0]) != nullptr;
-  };
-
+  FUSION_INFERSHAPE_INIT;
   PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of GRU.");
   PADDLE_ENFORCE(fair_input("WeightX"),
                  "Assert only one Input(WeightX) of GRU.");
diff --git a/paddle/fluid/operators/fusion_infershape_define.h b/paddle/fluid/operators/fusion_infershape_define.h
new file mode 100644
index 0000000000..89521672b0
--- /dev/null
+++ b/paddle/fluid/operators/fusion_infershape_define.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
+#define PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
+
+#include <string>
+#include "paddle/fluid/framework/shape_runtime_infer.h"
+
+namespace paddle {
+namespace operators {
+
+#define FUSION_INFERSHAPE_INIT                                                 \
+  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx); \
+  if (runtime_ctx == nullptr) {                                                \
+    LOG(FATAL) << "Should have runtime infer context";                         \
+  }                                                                            \
+  const auto& ins = runtime_ctx->OpBase().Inputs();                            \
+  const auto& outs = runtime_ctx->OpBase().Outputs();                          \
+  const auto& scope = runtime_ctx->InferScope();                               \
+  const auto ins_end = ins.end();                                              \
+  const auto outs_end = outs.end();                                            \
+  auto fair_input = [&](const std::string& name) -> bool {                     \
+    auto it = ins.find(name);                                                  \
+    if (it == ins_end) {                                                       \
+      return false;                                                            \
+    }                                                                          \
+    const auto& in = it->second;                                               \
+    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {                 \
+      return false;                                                            \
+    }                                                                          \
+    return scope.FindVar(in[0]) != nullptr;                                    \
+  };                                                                           \
+  auto fair_output = [&](const std::string& name) -> bool {                    \
+    auto it = outs.find(name);                                                 \
+    if (it == outs_end) {                                                      \
+      return false;                                                            \
+    }                                                                          \
+    const auto& out = it->second;                                              \
+    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {               \
+      return false;                                                            \
+    }                                                                          \
+    return scope.FindVar(out[0]) != nullptr;                                   \
+  }
+
+}  // namespace operators
+}  // namespace paddle
+
+#endif  // PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
index ae9d5d78ae..08af98f850 100644
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fusion_lstm_op.h"
 #include <string>
-#include "paddle/fluid/framework/shape_runtime_infer.h"
+#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -25,38 +25,7 @@ namespace paddle {
 namespace operators {
 
 void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx);
-  if (runtime_ctx == nullptr) {
-    LOG(FATAL) << "Should have runtime infer context";
-  }
-  const auto& ins = runtime_ctx->OpBase().Inputs();
-  const auto& outs = runtime_ctx->OpBase().Outputs();
-  const auto& scope = runtime_ctx->InferScope();
-  const auto ins_end = ins.end();
-  const auto outs_end = outs.end();
-  auto fair_input = [&](const std::string& name) -> bool {
-    auto it = ins.find(name);
-    if (it == ins_end) {
-      return false;
-    }
-    const auto& in = it->second;
-    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(in[0]) != nullptr;
-  };
-  auto fair_output = [&](const std::string& name) -> bool {
-    auto it = outs.find(name);
-    if (it == outs_end) {
-      return false;
-    }
-    const auto& out = it->second;
-    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {
-      return false;
-    }
-    return scope.FindVar(out[0]) != nullptr;
-  };
-
+  FUSION_INFERSHAPE_INIT;
   PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of LSTM.");
   PADDLE_ENFORCE(fair_input("WeightX"),
                  "Assert only one Input(WeightX) of LSTM.");

From 23b12c6f585cd18374810862f98760b80a5ae473 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Tue, 11 Sep 2018 18:32:15 +0800
Subject: [PATCH 41/85] fix invalid bcast in reduce strategy

---
 paddle/fluid/framework/details/multi_devices_graph_pass.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 7a99169849..d44ebbae4d 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -442,8 +442,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
 
-  if (use_gpu ||
-      strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+  if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
     // Insert BCast Ops
     for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
       auto &to_bcast_set = bcast_var_name_set[dev_id];

From 7dd54afd0c7f328891fbb0df15e434aa9afba216 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Tue, 11 Sep 2018 12:12:46 +0000
Subject: [PATCH 42/85] fix program desc unit test error

---
 paddle/fluid/framework/program_desc_test.cc | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
index 925ea98dbe..7e689a37da 100644
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -87,8 +87,17 @@ TEST(ProgramDesc, copy_ctor) {
     ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
     ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs());
 
-    ASSERT_EQ(op_copy->Proto()->SerializeAsString(),
-              op_origin->Proto()->SerializeAsString());
+    ASSERT_EQ(op_origin->Proto()->attrs().size(),
+              op_copy->Proto()->attrs().size());
+    for (auto it = op_origin->Proto()->attrs().begin();
+         it != op_origin->Proto()->attrs().end(); ++it) {
+      for (auto it_2 = op_copy->Proto()->attrs().begin();
+           it_2 != op_copy->Proto()->attrs().end(); ++it_2) {
+        if (it->name() == it_2->name()) {
+          ASSERT_TRUE(it_2->SerializeAsString() == it->SerializeAsString());
+        }
+      }
+    }
 
     if (op->Type() == "op_with_subblock") {
       ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));

From 8cbb3c0720ab48abd08b06a49ac2e073b750f22f Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Tue, 11 Sep 2018 21:56:17 +0800
Subject: [PATCH 43/85] refine lac ut and fix fetch

---
 paddle/fluid/inference/api/api_impl.cc        |  2 +-
 .../tests/api/analyzer_lac_tester.cc          | 28 -------------------
 2 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index bd9b4b1a81..6fe13ed027 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -262,7 +262,7 @@ void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
   if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) {
     buffer.Resize(sizeof(T) * data.size());
   }
-  std::memcpy(buffer.data(), data.data(), buffer.length());
+  std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size());
   // copy LoD
   for (const auto &level : fetch.lod()) {
     output->lod.emplace_back(level);
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index 522d870db8..7e00cb20ad 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -117,34 +117,6 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   input_slots->assign({input_tensor});
 }
 
-void BenchAllData(const std::string &model_path, const std::string &data_file,
-                  const int batch_size, const int repeat) {
-  NativeConfig config;
-  config.model_dir = model_path;
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  std::vector<PaddleTensor> input_slots, outputs_slots;
-  DataRecord data(data_file, batch_size);
-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  GetOneBatch(&input_slots, &data, batch_size);
-  for (int i = 0; i < FLAGS_burning; i++) {
-    predictor->Run(input_slots, &outputs_slots);
-  }
-  Timer timer;
-  double sum = 0;
-  for (int i = 0; i < repeat; i++) {
-    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
-      GetOneBatch(&input_slots, &data, batch_size);
-      timer.tic();
-      predictor->Run(input_slots, &outputs_slots);
-      sum += timer.toc();
-    }
-  }
-  PrintTime(batch_size, repeat, 1, 0, sum / repeat);
-}
-
 const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
                                 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
                                 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,

From 392ae69650ea453accdbdd2b5ed84f3764b2d2c6 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Tue, 11 Sep 2018 22:46:36 +0800
Subject: [PATCH 44/85] Set parallel executor thread num under nccl2
 distributed env (#13207)

---
 python/paddle/fluid/parallel_executor.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 4790e0f611..bd9f8b3c35 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -128,6 +128,13 @@ class ParallelExecutor(object):
                     os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
                 exec_strategy.num_threads = cpu_num * 2
 
+        # Set 1 thread num under nccl2 distribute 
+        #   env to make sure all gpus run ops in same order.
+        if num_trainers > 1:
+            assert (use_cuda)
+            # FIXME(gongwb): avoid this set.
+            exec_strategy.num_threads = 1
+
         if build_strategy is None:
             build_strategy = BuildStrategy()
 

From 8bb824bb93629fbf69d7e93ffc0dca85e726300c Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 12 Sep 2018 00:06:58 +0800
Subject: [PATCH 45/85] refine infershape hasinput and hasoutput

---
 paddle/fluid/framework/operator.cc            | 274 ++++++++++--------
 paddle/fluid/framework/shape_runtime_infer.h  |  86 ------
 paddle/fluid/operators/attention_lstm_op.cc   |  35 ++-
 paddle/fluid/operators/fusion_gru_op.cc       |  22 +-
 .../operators/fusion_infershape_define.h      |  60 ----
 paddle/fluid/operators/fusion_lstm_op.cc      |  31 +-
 6 files changed, 197 insertions(+), 311 deletions(-)
 delete mode 100644 paddle/fluid/framework/shape_runtime_infer.h
 delete mode 100644 paddle/fluid/operators/fusion_infershape_define.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 36025db7ba..bbd141cb3b 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
-#include "paddle/fluid/framework/shape_runtime_infer.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -459,147 +458,184 @@ bool OpSupportGPU(const std::string& op_type) {
   return false;
 }
 
-bool RuntimeInferShapeContext::HasInput(const std::string& name) const {
-  if (!op_.HasInputs(name)) {
-    return false;
-  }
-  auto& ins = Inputs(name);
-  size_t length = ins.size();
-  if (length == 0) {
-    return false;
-  }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input %s should not have more than one inputs", name);
-  auto ipt = ins[0];
-  auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-  return var != nullptr;
-}
+class RuntimeInferShapeContext : public InferShapeContext {
+ public:
+  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
+      : op_(op), scope_(scope) {}
 
-bool RuntimeInferShapeContext::HasOutput(const std::string& name) const {
-  if (!op_.HasOutputs(name)) {
-    return false;
-  }
-  auto& outs = Outputs(name);
-  size_t length = outs.size();
-  if (length == 0) {
-    return false;
-  }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Output %s should not have more than one inputs", name);
-  auto ipt = outs[0];
-  auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-  return var != nullptr;
-}
+  bool HasInput(const std::string& name) const override {
+    // True only if exactly one non-empty input is bound and exists in scope_.
+    const auto& ins = op_.Inputs();
+    auto it = ins.find(name);
+    if (it == ins.end()) {
+      return false;
+    }
+    const auto& in = it->second;
 
-bool RuntimeInferShapeContext::HasInputs(const std::string& name) const {
-  if (!op_.HasInputs(name)) {
-    return false;
-  }
-  auto inputs = op_.Inputs(name);
-  if (inputs.empty()) {
-    return false;
-  }
-  for (auto& input : inputs) {
-    if (scope_.FindVar(input) == nullptr) {
+    if (in.size() != 1 || in[0] == kEmptyVarName) {
       return false;
     }
+    return scope_.FindVar(in[0]) != nullptr;
   }
-  return true;
-}
 
-bool RuntimeInferShapeContext::HasOutputs(const std::string& name) const {
-  if (!op_.HasOutputs(name)) {
-    return false;
+  bool HasOutput(const std::string& name) const override {
+    // True only if exactly one non-empty output is bound and exists in scope_.
+    const auto& outs = op_.Outputs();
+    auto it = outs.find(name);
+    if (it == outs.end()) {
+      return false;
+    }
+    const auto& out = it->second;
+    if (out.size() != 1 || out[0] == kEmptyVarName) {
+      return false;
+    }
+    return scope_.FindVar(out[0]) != nullptr;
   }
-  auto outputs = op_.Outputs(name);
-  if (outputs.empty()) {
-    return false;
+
+  bool HasInputs(const std::string& name) const override {
+    if (!op_.HasInputs(name)) {
+      return false;
+    }
+    auto inputs = op_.Inputs(name);
+    if (inputs.empty()) {
+      return false;
+    }
+    for (auto& input : inputs) {
+      if (scope_.FindVar(input) == nullptr) {
+        return false;
+      }
+    }
+    return true;
   }
-  for (auto& output : outputs) {
-    if (scope_.FindVar(output) == nullptr) {
+
+  bool HasOutputs(const std::string& name) const override {
+    if (!op_.HasOutputs(name)) {
+      return false;
+    }
+    auto outputs = op_.Outputs(name);
+    if (outputs.empty()) {
       return false;
     }
+    for (auto& output : outputs) {
+      if (scope_.FindVar(output) == nullptr) {
+        return false;
+      }
+    }
+    return true;
   }
-  return true;
-}
 
-void RuntimeInferShapeContext::ShareLoD(const std::string& in,
-                                        const std::string& out, size_t i,
-                                        size_t j) const {
-  PADDLE_ENFORCE_LT(i, Inputs(in).size());
-  PADDLE_ENFORCE_LT(j, Outputs(out).size());
-  Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-  Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-  if (!in_var->IsType<LoDTensor>()) return;
-  PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                 "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-  auto in_tensor = in_var->Get<LoDTensor>();
-  auto* out_tensor = out_var->GetMutable<LoDTensor>();
-  out_tensor->set_lod(in_tensor.lod());
+  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
+
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
+    return op_.Inputs(name);
+  }
+
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
+    return op_.Outputs(name);
+  }
+
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const override {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
+    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+    if (!in_var->IsType<LoDTensor>()) return;
+    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+    auto in_tensor = in_var->Get<LoDTensor>();
+    auto* out_tensor = out_var->GetMutable<LoDTensor>();
+    out_tensor->set_lod(in_tensor.lod());
 
 // TODO(dzhwinter) : reuse ShareLoD in most operators.
 // Need to call ShareLayout explicitly in sequence related ops.
 // Shall we have a better method to shared info between in/out Tensor?
 #ifdef PADDLE_WITH_MKLDNN
-  // Fix me: ugly workaround below
-  // Correct solution:
-  //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
-  //    layout of output tensor should be set "manually" in Compute()
-  //    of each OPKernel. The reason layout should NOT be shared between
-  //    input and output "automatically" (now by InferShape()->ShareLoD())
-  //    is that layout transform may occur after InferShape().
-  // Workaround:
-  //    Skip set_layout() when input layout is kMKLDNN
-  //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
-  //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
-  //    in Compute()
-  if (in_tensor.layout() != DataLayout::kMKLDNN)
+    // Fix me: ugly workaround below
+    // Correct solution:
+    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+    //    layout of output tensor should be set "manually" in Compute()
+    //    of each OPKernel. The reason layout should NOT be shared between
+    //    input and output "automatically" (now by InferShape()->ShareLoD())
+    //    is that layout transform may occur after InferShape().
+    // Workaround:
+    //    Skip set_layout() when input layout is kMKLDNN
+    //    This is to avoid kMKLDNN being wrongly propagated into a non-MKLDNN
+    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
+    //    in Compute()
+    if (in_tensor.layout() != DataLayout::kMKLDNN)
 #endif
+      out_tensor->set_layout(in_tensor.layout());
+  }
+
+  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
+                   size_t j = 0) const {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
+    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+    if (!in_var->IsType<LoDTensor>()) return;
+    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+    auto in_tensor = in_var->Get<LoDTensor>();
+    auto* out_tensor = out_var->GetMutable<LoDTensor>();
     out_tensor->set_layout(in_tensor.layout());
-}
+  }
 
-void RuntimeInferShapeContext::ShareLayout(const std::string& in,
-                                           const std::string& out, size_t i,
-                                           size_t j) const {
-  PADDLE_ENFORCE_LT(i, Inputs(in).size());
-  PADDLE_ENFORCE_LT(j, Outputs(out).size());
-  Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-  Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-  if (!in_var->IsType<LoDTensor>()) return;
-  PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                 "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-  auto in_tensor = in_var->Get<LoDTensor>();
-  auto* out_tensor = out_var->GetMutable<LoDTensor>();
-  out_tensor->set_layout(in_tensor.layout());
-}
-
-DDim RuntimeInferShapeContext::GetDim(const std::string& name) const {
-  Variable* var = scope_.FindVar(name);
-  PADDLE_ENFORCE_NOT_NULL(var);
-  if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>().dims();
-  } else if (var->IsType<SelectedRows>()) {
-    return var->Get<SelectedRows>().GetCompleteDims();
-  } else {
-    PADDLE_THROW(
-        "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
-        "type_id is %s.",
-        name, var->Type().name());
+  bool IsRuntime() const override { return true; }
+
+ protected:
+  DDim GetDim(const std::string& name) const override {
+    Variable* var = scope_.FindVar(name);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    if (var->IsType<LoDTensor>()) {
+      return var->Get<LoDTensor>().dims();
+    } else if (var->IsType<SelectedRows>()) {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    } else {
+      PADDLE_THROW(
+          "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
+          "type_id is %s.",
+          name, var->Type().name());
+    }
   }
-}
 
-void RuntimeInferShapeContext::SetDim(const std::string& name,
-                                      const DDim& dim) {
-  Variable* var = scope_.FindVar(name);
-  if (var->IsType<LoDTensor>()) {
-    var->GetMutable<LoDTensor>()->Resize(dim);
-  } else if (var->IsType<SelectedRows>()) {
-    var->GetMutable<SelectedRows>()->set_height(dim[0]);
-  } else {
-    PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", name,
-                 var->Type().name());
+  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
+    PADDLE_THROW("Only compile time support this method");
   }
-}
+
+  void SetDim(const std::string& name, const DDim& dim) override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<LoDTensor>()) {
+      var->GetMutable<LoDTensor>()->Resize(dim);
+    } else if (var->IsType<SelectedRows>()) {
+      var->GetMutable<SelectedRows>()->set_height(dim[0]);
+    } else {
+      PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
+                   name, var->Type().name());
+    }
+  }
+
+  void SetRepeatedDims(const std::string& name,
+                       const std::vector<DDim>& dims) override {
+    PADDLE_THROW("Only compile time support this method");
+  }
+
+  proto::VarType::Type GetVarType(const std::string& name) const override {
+    auto* var = scope_.FindVar(name);
+    return ToVarType(var->Type());
+  }
+
+  InferShapeVarPtr GetVarPtr(const std::string& name) override {
+    return scope_.FindVar(name);
+  }
+
+ private:
+  const OperatorBase& op_;
+  const Scope& scope_;
+};
 
 static void CheckTensorNANOrInf(const std::string& name,
                                 const framework::Tensor& tensor) {
diff --git a/paddle/fluid/framework/shape_runtime_infer.h b/paddle/fluid/framework/shape_runtime_infer.h
deleted file mode 100644
index 04d4e33f7a..0000000000
--- a/paddle/fluid/framework/shape_runtime_infer.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/shape_inference.h"
-#include "paddle/fluid/framework/var_type.h"
-
-namespace paddle {
-namespace framework {
-
-class RuntimeInferShapeContext : public InferShapeContext {
- public:
-  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
-      : op_(op), scope_(scope) {}
-
-  bool HasInput(const std::string& name) const override;
-  bool HasOutput(const std::string& name) const override;
-  bool HasInputs(const std::string& name) const override;
-  bool HasOutputs(const std::string& name) const override;
-
-  const OperatorBase& OpBase() const { return op_; }
-
-  const Scope& InferScope() const { return scope_; }
-  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
-
-  const std::vector<std::string>& Inputs(
-      const std::string& name) const override {
-    return op_.Inputs(name);
-  }
-
-  const std::vector<std::string>& Outputs(
-      const std::string& name) const override {
-    return op_.Outputs(name);
-  }
-
-  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
-                size_t j = 0) const override;
-
-  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
-                   size_t j = 0) const;
-
-  bool IsRuntime() const override { return true; }
-
- protected:
-  DDim GetDim(const std::string& name) const override;
-  void SetDim(const std::string& name, const DDim& dim) override;
-
-  std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
-    PADDLE_THROW("Only compile time support this method");
-  }
-  void SetRepeatedDims(const std::string& name,
-                       const std::vector<DDim>& dims) override {
-    PADDLE_THROW("Only compile time support this method");
-  }
-
-  proto::VarType::Type GetVarType(const std::string& name) const override {
-    auto* var = scope_.FindVar(name);
-    return ToVarType(var->Type());
-  }
-
-  InferShapeVarPtr GetVarPtr(const std::string& name) override {
-    return scope_.FindVar(name);
-  }
-
- private:
-  const OperatorBase& op_;
-  const Scope& scope_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 7531aa9a46..9b943440a8 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
-#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -24,28 +23,28 @@ namespace paddle {
 namespace operators {
 
 void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  FUSION_INFERSHAPE_INIT;
-  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_input("C0"),
+  PADDLE_ENFORCE(ctx->HasInput("X"),
+                 "Assert only one Input(X) of AttentionLSTM.");
+  PADDLE_ENFORCE(ctx->HasInput("C0"),
                  "Assert only one Input(C0) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_input("LSTMWeight"),
+  PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"),
                  "Assert only one Input(LSTMWeight) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_input("LSTMBias"),
+  PADDLE_ENFORCE(ctx->HasInput("LSTMBias"),
                  "Assert only one Input(LSTMBias) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_input("AttentionWeight"),
+  PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"),
                  "Assert only one Input(AttentionWeight) of AttentionLSTM.");
 
-  PADDLE_ENFORCE(fair_output("Hidden"),
+  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                  "Assert only one Output(Hidden) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_output("Cell"),
+  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
                  "Assert only one Output(Cell) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_output("AttentionedX"),
+  PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"),
                  "Assert only one Output(AttentionedX) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_output("AttentionFCOut"),
+  PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"),
                  "Assert only one Output(AttentionFCOut) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_output("LSTMX"),
+  PADDLE_ENFORCE(ctx->HasOutput("LSTMX"),
                  "Assert only one Output(LSTMX) of AttentionLSTM.");
-  PADDLE_ENFORCE(fair_output("LSTMOUT"),
+  PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"),
                  "Assert only one Output(LSTMOUT) of AttentionLSTM.");
 
   auto x_dims = ctx->GetInputDim("X");
@@ -66,7 +65,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
   auto c_dims = ctx->GetInputDim("C0");
   PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2.");
   PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D);
-  if (fair_input("H0")) {
+  if (ctx->HasInput("H0")) {
     auto h_dims = ctx->GetInputDim("H0");
     PADDLE_ENFORCE(h_dims == c_dims,
                    "The dimension of Input(H0) and Input(C0) "
@@ -80,7 +79,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
   PADDLE_ENFORCE_EQ(atten_w_dims[1], 1,
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
-  if (fair_input("AttentionBias")) {
+  if (ctx->HasInput("AttentionBias")) {
     auto atten_b_dims = ctx->GetInputDim("AttentionBias");
     PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2,
                       "Input(AttentionBias)'s rank must be 2.");
@@ -90,7 +89,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                       "AttentionBias shapes must be 1 * 1.");
   }
 
-  if (fair_input("AttentionScalar")) {
+  if (ctx->HasInput("AttentionScalar")) {
     auto dims = ctx->GetInputDim("AttentionScalar");
     PADDLE_ENFORCE_EQ(dims.size(), 2,
                       "Input(AttentionScalar)'s rank must be 2.");
@@ -98,10 +97,10 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
     PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1.");
   }
 
-  if (fair_input("AttentionScalarBias")) {
+  if (ctx->HasInput("AttentionScalarBias")) {
     auto dims = ctx->GetInputDim("AttentionScalarBias");
     PADDLE_ENFORCE(
-        fair_input("AttentionScalar"),
+        ctx->HasInput("AttentionScalar"),
         "AttentionScalar should not be null when have AttentionScalarBias.");
     PADDLE_ENFORCE_EQ(dims.size(), 2,
                       "Input(AttentionScalarBias)'s rank must be 2.");
diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index b10d311f05..31e87d9113 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/fusion_gru_op.h"
 #include <cstring>  // for memcpy
 #include <string>
-#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -26,14 +25,13 @@ namespace paddle {
 namespace operators {
 
 void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
-  FUSION_INFERSHAPE_INIT;
-  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of GRU.");
-  PADDLE_ENFORCE(fair_input("WeightX"),
+  PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of GRU.");
+  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
                  "Assert only one Input(WeightX) of GRU.");
-  PADDLE_ENFORCE(fair_input("WeightH"),
+  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
                  "Assert only one Input(WeightH) of GRU.");
-  PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of GRU.");
-  PADDLE_ENFORCE(fair_output("Hidden"),
+  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of GRU.");
+  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                  "Assert only one Output(Hidden) of GRU.");
 
   auto x_dims = ctx->GetInputDim("X");
@@ -60,12 +58,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                     "should be 3 * %d.",
                     frame_size);
 
-  if (fair_input("H0")) {
+  if (ctx->HasInput("H0")) {
     auto h0_dims = ctx->GetInputDim("H0");
     PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
                       "The width of H0 must be equal to frame_size.");
   }
-  if (fair_input("Bias")) {
+  if (ctx->HasInput("Bias")) {
     auto b_dims = ctx->GetInputDim("Bias");
     PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
     PADDLE_ENFORCE_EQ(b_dims[0], 1,
@@ -81,11 +79,11 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
-    PADDLE_ENFORCE(fair_output("ReorderedH0"),
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
                    "Assert only one Output(ReorderedH0) of GRU.");
-    PADDLE_ENFORCE(fair_output("BatchedInput"),
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
                    "Assert only one Output(BatchedInput) of GRU.");
-    PADDLE_ENFORCE(fair_output("BatchedOut"),
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
                    "Assert only one Output(BatchedOut) of GRU.");
     ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
     ctx->SetOutputDim("BatchedOut", out_dims);
diff --git a/paddle/fluid/operators/fusion_infershape_define.h b/paddle/fluid/operators/fusion_infershape_define.h
deleted file mode 100644
index 89521672b0..0000000000
--- a/paddle/fluid/operators/fusion_infershape_define.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
-#define PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
-
-#include <string>
-#include "paddle/fluid/framework/shape_runtime_infer.h"
-
-namespace paddle {
-namespace operators {
-
-#define FUSION_INFERSHAPE_INIT                                                 \
-  auto* runtime_ctx = dynamic_cast<framework::RuntimeInferShapeContext*>(ctx); \
-  if (runtime_ctx == nullptr) {                                                \
-    LOG(FATAL) << "Should have runtime infer context";                         \
-  }                                                                            \
-  const auto& ins = runtime_ctx->OpBase().Inputs();                            \
-  const auto& outs = runtime_ctx->OpBase().Outputs();                          \
-  const auto& scope = runtime_ctx->InferScope();                               \
-  const auto ins_end = ins.end();                                              \
-  const auto outs_end = outs.end();                                            \
-  auto fair_input = [&](const std::string& name) -> bool {                     \
-    auto it = ins.find(name);                                                  \
-    if (it == ins_end) {                                                       \
-      return false;                                                            \
-    }                                                                          \
-    const auto& in = it->second;                                               \
-    if (in.size() != 1 || in[0] == framework::kEmptyVarName) {                 \
-      return false;                                                            \
-    }                                                                          \
-    return scope.FindVar(in[0]) != nullptr;                                    \
-  };                                                                           \
-  auto fair_output = [&](const std::string& name) -> bool {                    \
-    auto it = outs.find(name);                                                 \
-    if (it == outs_end) {                                                      \
-      return false;                                                            \
-    }                                                                          \
-    const auto& out = it->second;                                              \
-    if (out.size() != 1 || out[0] == framework::kEmptyVarName) {               \
-      return false;                                                            \
-    }                                                                          \
-    return scope.FindVar(out[0]) != nullptr;                                   \
-  }
-
-}  // namespace operators
-}  // namespace paddle
-
-#endif  // PADDLE_FLUID_OPERATORS_FUSION_INFERSHAPE_DEFINE_H_
diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc
index 08af98f850..55e465e3af 100644
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fusion_lstm_op.h"
 #include <string>
-#include "paddle/fluid/operators/fusion_infershape_define.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
@@ -25,23 +24,23 @@ namespace paddle {
 namespace operators {
 
 void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
-  FUSION_INFERSHAPE_INIT;
-  PADDLE_ENFORCE(fair_input("X"), "Assert only one Input(X) of LSTM.");
-  PADDLE_ENFORCE(fair_input("WeightX"),
+  PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
                  "Assert only one Input(WeightX) of LSTM.");
-  PADDLE_ENFORCE(fair_input("WeightH"),
+  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
                  "Assert only one Input(WeightH) of LSTM.");
-  PADDLE_ENFORCE(fair_input("Bias"), "Assert only one Input(Bias) of LSTM.");
-  PADDLE_ENFORCE(fair_output("XX"), "Assert only one Output(XX) of LSTM.");
-  PADDLE_ENFORCE(fair_output("Hidden"),
+  PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                  "Assert only one Output(Hidden) of LSTM.");
-  PADDLE_ENFORCE(fair_output("Cell"), "Assert only one Output(Cell) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+                 "Assert only one Output(Cell) of LSTM.");
 
   auto x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
 
-  if (fair_input("H0")) {
-    PADDLE_ENFORCE(fair_input("C0"),
+  if (ctx->HasInput("H0")) {
+    PADDLE_ENFORCE(ctx->HasInput("C0"),
                    "Input(Cell) and Input(Hidden) of LSTM should not "
                    "be null at the same time.");
     auto h_dims = ctx->GetInputDim("H0");
@@ -93,15 +92,15 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
     xx_width = wx_dims[1];
   } else {
     xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
-    PADDLE_ENFORCE(fair_output("BatchedInput"),
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
                    "Assert only one Output(BatchedInput) of LSTM.");
-    PADDLE_ENFORCE(fair_output("BatchedHidden"),
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
                    "Assert only one Output(BatchedHidden) of LSTM.");
-    PADDLE_ENFORCE(fair_output("BatchedCell"),
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
                    "Assert only one Output(BatchedCell) of LSTM.");
-    PADDLE_ENFORCE(fair_output("ReorderedH0"),
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
                    "Assert only one Output(ReorderedH0) of LSTM");
-    PADDLE_ENFORCE(fair_output("ReorderedC0"),
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
                    "Assert only one Output(ReorderedC0) of LSTM.");
     ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
     ctx->SetOutputDim("BatchedHidden", out_dims);

From 8cee9f6176caa87e21109c665fa95c51d3ab296c Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Wed, 12 Sep 2018 09:55:46 +0800
Subject: [PATCH 46/85] Fix rpcclient's wait action in async env. (#13307)

---
 .../operators/distributed/CMakeLists.txt      |   1 +
 .../operators/distributed/grpc_client.cc      | 142 +++++++++---------
 .../fluid/operators/distributed/grpc_client.h | 110 +++++++-------
 .../operators/distributed/request_handler.h   |  75 +++++++--
 .../fluid/operators/distributed/rpc_client.h  |  63 ++++----
 .../operators/distributed/varhandle_test.cc   |  55 +++++++
 paddle/fluid/operators/prefetch_op.cc         |   8 +-
 paddle/fluid/operators/recv_op.cc             |   7 +-
 paddle/fluid/operators/send_op.cc             |  10 +-
 9 files changed, 296 insertions(+), 175 deletions(-)
 create mode 100644 paddle/fluid/operators/distributed/varhandle_test.cc

diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index da5d20505e..56734b81e8 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -20,6 +20,7 @@ if(WITH_GRPC)
     DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
   cc_test(rpc_server_test SRCS rpc_server_test.cc
     DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
+  cc_test(varhandle_test SRCS varhandle_test.cc)
   return()
 endif()
 
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index b4f60c9ff9..07ac20797d 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -59,40 +59,32 @@ GRPCClient::~GRPCClient() {
     }
     channels_.clear();
   }
-
   client_thread_->join();
 }
 
-bool GRPCClient::AsyncSendVar(const std::string& ep,
-                              const platform::DeviceContext& ctx,
-                              const framework::Scope& scope,
-                              const std::string& var_name, int64_t time_out) {
+VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
+                                      const platform::DeviceContext& ctx,
+                                      const framework::Scope& scope,
+                                      const std::string& var_name,
+                                      int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
+  SendProcessor* s = new SendProcessor(ch);
+  VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope));
+  s->Prepare(h, time_out);
 
-  framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch,
-                      this] {
+  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
     auto* var = p_scope->FindVar(var_name_val);
 
     ::grpc::ByteBuffer req;
     SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);
 
-    // varhandle
-    VarHandle var_h;
-    var_h.ep = ep_val;
-    var_h.scope = p_scope;
-    var_h.name = var_name_val;
-    var_h.ctx = p_ctx;
-    var_h.method = "Send";
-
-    VLOG(3) << var_h.String() << " begin";
+    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
 
     // stub context
-    SendProcessor* s = new SendProcessor(ch);
-    s->Prepare(var_h, time_out);
     s->response_call_back_ = nullptr;
 
     auto call = s->stub_g_.PrepareUnaryCall(
@@ -102,13 +94,13 @@ bool GRPCClient::AsyncSendVar(const std::string& ep,
   });
   req_count_++;
 
-  return true;
+  return h;
 }
 
 void ProcGetResponse(const VarHandle& var_h,
                      const ::grpc::ByteBuffer& ret_msg) {
   framework::Variable* outvar = nullptr;
-  DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
+  DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar);
 }
 
 template <typename T>
@@ -119,37 +111,30 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
   result->Swap(&tmp);
 }
 
-bool GRPCClient::AsyncGetVar(const std::string& ep,
-                             const platform::DeviceContext& ctx,
-                             const framework::Scope& scope,
-                             const std::string& var_name, int64_t time_out) {
+VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
+                                     const platform::DeviceContext& ctx,
+                                     const framework::Scope& scope,
+                                     const std::string& var_name,
+                                     int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
+  GetProcessor* s = new GetProcessor(ch);
+  VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
+  s->Prepare(h, time_out);
 
-  framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch,
-                      this] {
+  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
     ::grpc::ByteBuffer buf;
     RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
 
-    // var handle
-    VarHandle var_h;
-    var_h.ep = ep_val;
-    var_h.scope = p_scope;
-    var_h.name = var_name_val;
-    var_h.ctx = p_ctx;
-    var_h.method = "Get";
-
-    VLOG(3) << var_h.String() << " begin";
+    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
 
     // stub context
-    GetProcessor* s = new GetProcessor(ch);
-    s->Prepare(var_h, time_out);
     s->response_call_back_ = ProcGetResponse;
 
     auto call = s->stub_g_.PrepareUnaryCall(
@@ -160,42 +145,36 @@ bool GRPCClient::AsyncGetVar(const std::string& ep,
 
   req_count_++;
 
-  return true;
+  return h;
 }
 
-bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
-                                  const platform::DeviceContext& ctx,
-                                  const framework::Scope& scope,
-                                  const std::string& in_var_name,
-                                  const std::string& out_var_name,
-                                  int64_t time_out) {
+VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
+                                          const platform::DeviceContext& ctx,
+                                          const framework::Scope& scope,
+                                          const std::string& in_var_name,
+                                          const std::string& out_var_name,
+                                          int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string in_var_name_val = in_var_name;
   const std::string out_var_name_val = out_var_name;
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
+  GetProcessor* s = new GetProcessor(ch);
+  VarHandlePtr h(
+      new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope));
+  s->Prepare(h, time_out);
 
   framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      time_out, ch, this] {
+                      time_out, s, this] {
     auto* var = p_scope->FindVar(in_var_name_val);
 
     ::grpc::ByteBuffer req;
     SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
 
-    // var handle
-    VarHandle var_h;
-    var_h.ep = ep_val;
-    var_h.scope = p_scope;
-    var_h.name = out_var_name_val;
-    var_h.ctx = p_ctx;
-    var_h.method = "Prefetch";
-
-    VLOG(3) << var_h.String() << " begin";
+    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
 
     // stub context
-    GetProcessor* s = new GetProcessor(ch);
-    s->Prepare(var_h, time_out);
     s->response_call_back_ = ProcGetResponse;
 
     auto call = s->stub_g_.PrepareUnaryCall(
@@ -206,56 +185,68 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
   });
 
   req_count_++;
-  return true;
+  return h;
 }
 
-void GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
-                                       int64_t time_out) {
+VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
+                                               int64_t time_out) {
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  s->Prepare(time_out);
+  VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE,
+                               nullptr, nullptr));
+  s->Prepare(h, time_out);
 
   sendrecv::VariableMessage req;
   req.set_varname(BATCH_BARRIER_MESSAGE);
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
+  return h;
 }
 
-void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
-                                       int64_t time_out) {
+VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
+                                               int64_t time_out) {
   const auto ch = GetChannel(ep);
   FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
-  s->Prepare(time_out);
+  VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE,
+                               nullptr, nullptr));
+  s->Prepare(h, time_out);
 
   sendrecv::VariableMessage req;
   req.set_varname(FETCH_BARRIER_MESSAGE);
   auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
+  return h;
 }
 
-void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
+VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
+                                           int64_t time_out) {
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  s->Prepare(time_out);
+  VarHandlePtr h(
+      new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr));
+  s->Prepare(h, time_out);
 
   sendrecv::VariableMessage req;
   req.set_varname(COMPLETE_MESSAGE);
   auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
+  return h;
 }
 
-void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
-                                       const std::string& dir,
-                                       int64_t time_out) {
+VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
+                                               const std::string& dir,
+                                               int64_t time_out) {
   const auto ch = GetChannel(ep);
 
   CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
-  s->Prepare(time_out);
+  VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE,
+                               nullptr, nullptr));
+  s->Prepare(h, time_out);
 
   sendrecv::VariableMessage req;
   req.set_varname(CHECKPOINT_SAVE_MESSAGE);
@@ -264,6 +255,7 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
   auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
   rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   req_count_++;
+  return h;
 }
 
 bool GRPCClient::Wait() {
@@ -276,25 +268,28 @@ void GRPCClient::Proceed() {
   void* tag = nullptr;
   bool ok = false;
 
+  VLOG(3) << "GRPCClient Proceed begin";
   while (!stopped_ && cq_.Next(&tag, &ok)) {
     BaseProcessor* c = static_cast<BaseProcessor*>(tag);
     GPR_ASSERT(ok);
     PADDLE_ENFORCE(c);
     if (c->status_.ok()) {
-      VLOG(3) << c->var_h_.String() << " process";
+      VLOG(3) << c->GetVarHandlePtr()->String() << " process";
       c->Process();
     } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
-      LOG(ERROR) << c->var_h_.String()
+      LOG(ERROR) << c->GetVarHandlePtr()->String()
                  << " meets grpc error:" << c->status_.error_message();
       {
         std::lock_guard<std::mutex> lk(sync_mutex_);
         ok_ = false;
       }
-      sync_cond_.notify_all();
+      c->Finish(false);
     } else {
-      LOG(FATAL) << c->var_h_.String()
+      LOG(FATAL) << c->GetVarHandlePtr()->String()
                  << " meets grpc error:" << c->status_.error_message();
+      c->Finish(false);
     }
+
     delete c;
     {
       std::lock_guard<std::mutex> lk(sync_mutex_);
@@ -302,6 +297,7 @@ void GRPCClient::Proceed() {
     }
     sync_cond_.notify_all();
   }
+  VLOG(3) << "GRPCClient Proceed end";
 }
 
 std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
index 0c95ffeb5c..75a3662316 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -53,15 +53,14 @@ void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
 
 class BaseProcessor {
  public:
-  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) {
-    context_ = nullptr;
-  }
+  BaseProcessor() { context_ = nullptr; }
 
   virtual ~BaseProcessor() {}
 
-  virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
+  virtual void Prepare(VarHandlePtr h, int64_t time_out) {
+    var_h_ = h;
+
     context_.reset(new grpc::ClientContext());
-    var_h_ = var_info;
     context_->set_wait_for_ready(true);
     if (time_out) {
       std::chrono::system_clock::time_point deadline =
@@ -71,21 +70,21 @@ class BaseProcessor {
     }
   }
 
-  virtual void Prepare(int64_t time_out) {
-    context_.reset(new grpc::ClientContext());
-    context_->set_wait_for_ready(true);
-
-    std::chrono::system_clock::time_point deadline =
-        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
-
-    context_->set_deadline(deadline);
+  void Process() {
+    ProcessImpl();
+    var_h_->Finish(true);
   }
 
-  virtual void Process() = 0;
+  VarHandlePtr GetVarHandlePtr() { return var_h_; }
+  bool Wait() { return var_h_->Wait(); }
+  void Finish(bool ok) { return var_h_->Finish(ok); }
+  virtual void ProcessImpl() = 0;
 
   std::unique_ptr<grpc::ClientContext> context_;
   grpc::Status status_;
-  VarHandle var_h_;
+
+ protected:
+  VarHandlePtr var_h_;
 };
 
 typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
@@ -94,13 +93,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
 class SendProcessor : public BaseProcessor {
  public:
   explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch), stub_g_(ch) {}
+      : BaseProcessor(), stub_g_(ch) {}
 
   virtual ~SendProcessor() {}
 
-  virtual void Process() {
+  void ProcessImpl() override {
     if (response_call_back_) {
-      response_call_back_(var_h_, reply_);
+      response_call_back_(*var_h_.get(), reply_);
     }
   }
 
@@ -115,13 +114,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
 class GetProcessor : public BaseProcessor {
  public:
   explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch), stub_g_(ch) {}
+      : BaseProcessor(), stub_g_(ch) {}
 
   virtual ~GetProcessor() {}
 
-  virtual void Process() {
+  void ProcessImpl() override {
     if (response_call_back_) {
-      response_call_back_(var_h_, reply_);
+      response_call_back_(*var_h_.get(), reply_);
     }
   }
 
@@ -133,13 +132,13 @@ class GetProcessor : public BaseProcessor {
 class BatchBarrierProcessor : public BaseProcessor {
  public:
   explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {
+      : BaseProcessor() {
     stub_ = sendrecv::SendRecvService::NewStub(ch);
   }
 
   virtual ~BatchBarrierProcessor() {}
 
-  virtual void Process() {}
+  void ProcessImpl() override {}
   sendrecv::VoidMessage reply_;
   std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
@@ -147,13 +146,13 @@ class BatchBarrierProcessor : public BaseProcessor {
 class FetchBarrierProcessor : public BaseProcessor {
  public:
   explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {
+      : BaseProcessor() {
     stub_ = sendrecv::SendRecvService::NewStub(ch);
   }
 
   virtual ~FetchBarrierProcessor() {}
 
-  virtual void Process() {}
+  void ProcessImpl() override {}
   sendrecv::VariableMessage reply_;
   std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
@@ -161,13 +160,13 @@ class FetchBarrierProcessor : public BaseProcessor {
 class CheckpointNotifyProcessor : public BaseProcessor {
  public:
   explicit CheckpointNotifyProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {
+      : BaseProcessor() {
     stub_ = sendrecv::SendRecvService::NewStub(ch);
   }
 
   virtual ~CheckpointNotifyProcessor() {}
 
-  virtual void Process() {}
+  void ProcessImpl() override {}
   sendrecv::VoidMessage reply_;
   std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
@@ -177,32 +176,37 @@ class GRPCClient : public RPCClient {
   GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
   virtual ~GRPCClient();
 
-  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
-                    const framework::Scope& scope, const std::string& var_name,
-                    int64_t time_out = FLAGS_rpc_deadline) override;
-
-  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
-                   const framework::Scope& scope, const std::string& var_name,
-                   int64_t time_out = FLAGS_rpc_deadline) override;
-
-  bool AsyncPrefetchVar(const std::string& ep,
-                        const platform::DeviceContext& ctx,
-                        const framework::Scope& scope,
-                        const std::string& in_var_name,
-                        const std::string& out_var_name,
-                        int64_t time_out = FLAGS_rpc_deadline) override;
-
-  void AsyncSendBatchBarrier(const std::string& ep,
-                             int64_t time_out = FLAGS_rpc_deadline) override;
-
-  void AsyncSendFetchBarrier(const std::string& ep,
-                             int64_t time_out = FLAGS_rpc_deadline) override;
-
-  void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
-                             int64_t time_out = FLAGS_rpc_deadline) override;
-
-  void AsyncSendComplete(const std::string& ep,
-                         int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncSendVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name,
+                            int64_t time_out = FLAGS_rpc_deadline) override;
+
+  VarHandlePtr AsyncGetVar(const std::string& ep,
+                           const platform::DeviceContext& ctx,
+                           const framework::Scope& scope,
+                           const std::string& var_name,
+                           int64_t time_out = FLAGS_rpc_deadline) override;
+
+  VarHandlePtr AsyncPrefetchVar(const std::string& ep,
+                                const platform::DeviceContext& ctx,
+                                const framework::Scope& scope,
+                                const std::string& in_var_name,
+                                const std::string& out_var_name,
+                                int64_t time_out = FLAGS_rpc_deadline) override;
+
+  VarHandlePtr AsyncSendBatchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
+
+  VarHandlePtr AsyncSendFetchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
+
+  VarHandlePtr AsyncCheckpointNotify(
+      const std::string& ep, const std::string& dir,
+      int64_t time_out = FLAGS_rpc_deadline) override;
+
+  VarHandlePtr AsyncSendComplete(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
 
   bool Wait() override;
 
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 64ac728184..3c3f9d17c8 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -28,6 +28,7 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace operators {
@@ -49,23 +50,77 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
 
 class RPCServer;
 
-struct VarHandle {
-  // RPC endpoint.
-  std::string ep;
-  const platform::DeviceContext* ctx;
-  const framework::Scope* scope;
-  // Variable name.
-  std::string name;
-  // RPC method name.
-  std::string method;
+class VarHandle {
+ public:
+  VarHandle(const std::string ep, const std::string& method,
+            const std::string& name,
+            const platform::DeviceContext* p_ctx = nullptr,
+            const framework::Scope* p_scope = nullptr)
+      : ok_(kVarHandleDefaultState) {
+    ep_ = ep;
+    ctx_ = p_ctx;
+    scope_ = p_scope;
+    name_ = name;
+    method_ = method;
+  }
+
+  virtual ~VarHandle() {}
+
+ public:
+  bool Wait() {
+    {
+      std::unique_lock<std::mutex> lk(sync_mutex_);
+      wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; });
+    }
+    VLOG(7) << "VarHandle wait:" << ok_;
+    return ok_ != 0;
+  }
+
+  void Finish(bool ok) {
+    {
+      std::unique_lock<std::mutex> lk(sync_mutex_);
+      ok_ = ok;
+    }
+    VLOG(7) << "VarHandle finish:" << ok;
+    wait_cond_.notify_all();
+  }
 
   std::string String() const {
     std::ostringstream s;
-    s << method << " name:[" << name << "], ep:[" << ep << "]";
+    s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_
+      << "]";
     return s.str();
   }
+
+  std::string ep() const { return ep_; }
+  const platform::DeviceContext* ctx() const { return ctx_; }
+  const framework::Scope* scope() const { return scope_; }
+  std::string name() const { return name_; }
+  std::string method() const { return method_; }
+
+ protected:
+  // RPC endpoint.
+  std::string ep_;
+  const platform::DeviceContext* ctx_;
+  const framework::Scope* scope_;
+  // Variable name.
+  std::string name_;
+  // RPC method name.
+  std::string method_;
+
+ protected:
+  std::mutex sync_mutex_;
+  std::condition_variable wait_cond_;
+  int ok_;
+
+  static const int kVarHandleDefaultState = -1;
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(VarHandle);
 };
 
+typedef std::shared_ptr<VarHandle> VarHandlePtr;
+
 class RequestHandler {
  public:
   explicit RequestHandler(bool sync_mode)
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index 22a022a5d2..3539ee5e45 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -14,12 +14,14 @@
 
 #pragma once
 
+#include <condition_variable>  // NOLINT
 #include <string>
 #include "gflags/gflags.h"
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 
 DECLARE_int32(rpc_deadline);
 
@@ -31,37 +33,36 @@ class RPCClient {
  public:
   RPCClient() {}
   virtual ~RPCClient() {}
-  virtual bool AsyncSendVar(const std::string& ep,
-                            const platform::DeviceContext& ctx,
-                            const framework::Scope& scope,
-                            const std::string& var_name,
-                            int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual bool AsyncGetVar(const std::string& ep,
-                           const platform::DeviceContext& ctx,
-                           const framework::Scope& scope,
-                           const std::string& var_name,
-                           int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual bool AsyncPrefetchVar(const std::string& ep,
-                                const platform::DeviceContext& ctx,
-                                const framework::Scope& scope,
-                                const std::string& in_var_name,
-                                const std::string& out_var_name,
-                                int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual void AsyncSendBatchBarrier(const std::string& ep,
-                                     int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual void AsyncSendFetchBarrier(const std::string& ep,
-                                     int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual void AsyncCheckpointNotify(const std::string& ep,
-                                     const std::string& dir,
-                                     int64_t time_out = FLAGS_rpc_deadline) = 0;
-
-  virtual void AsyncSendComplete(const std::string& ep,
-                                 int64_t time_out = FLAGS_rpc_deadline) = 0;
+  virtual VarHandlePtr AsyncSendVar(const std::string& ep,
+                                    const platform::DeviceContext& ctx,
+                                    const framework::Scope& scope,
+                                    const std::string& var_name,
+                                    int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual VarHandlePtr AsyncGetVar(const std::string& ep,
+                                   const platform::DeviceContext& ctx,
+                                   const framework::Scope& scope,
+                                   const std::string& var_name,
+                                   int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual VarHandlePtr AsyncPrefetchVar(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& in_var_name,
+      const std::string& out_var_name,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual VarHandlePtr AsyncSendBatchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual VarHandlePtr AsyncSendFetchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual VarHandlePtr AsyncCheckpointNotify(
+      const std::string& ep, const std::string& dir,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual VarHandlePtr AsyncSendComplete(
+      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
 
   // Complete tells all the pserver instances that finishe the training,
   // the pserver can reduce it's barrier count, and continue to train
diff --git a/paddle/fluid/operators/distributed/varhandle_test.cc b/paddle/fluid/operators/distributed/varhandle_test.cc
new file mode 100644
index 0000000000..a0fcaf8864
--- /dev/null
+++ b/paddle/fluid/operators/distributed/varhandle_test.cc
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+
+using paddle::operators::distributed::VarHandlePtr;
+using paddle::operators::distributed::VarHandle;
+
+void WaitTrue(VarHandlePtr s) { EXPECT_TRUE(s->Wait()); }
+
+void WaitFalse(VarHandlePtr s) { EXPECT_FALSE(s->Wait()); }
+
+TEST(VarHandle, Run) {
+  std::vector<VarHandlePtr> a;
+  for (int i = 0; i < 12; i++) {
+    VarHandlePtr s(new VarHandle("", "", "", nullptr, nullptr));
+    a.push_back(s);
+  }
+
+  std::vector<std::unique_ptr<std::thread>> t;
+  for (int i = 0; i < 6; i++) {
+    t.emplace_back(new std::thread(WaitFalse, a[i]));
+  }
+
+  for (int i = 0; i < 6; i++) {
+    a[i]->Finish(false);
+    t[i]->join();
+  }
+
+  for (int i = 6; i < 12; i++) {
+    t.emplace_back(new std::thread(WaitTrue, a[i]));
+  }
+
+  for (int i = 6; i < 12; i++) {
+    a[i]->Finish(true);
+    t[i]->join();
+  }
+}
diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc
index 4b804740a0..0519c15e13 100644
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -44,16 +44,20 @@ class PrefetchOp : public framework::OperatorBase {
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
+    std::vector<distributed::VarHandlePtr> rets;
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
                 << outs[i] << " back";
-        rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]);
+        rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope,
+                                                    ins[i], outs[i]));
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
     }
-    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+    for (size_t i = 0; i < rets.size(); i++) {
+      PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index a1f368e869..4d34b8a168 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -44,12 +44,15 @@ class RecvOp : public framework::OperatorBase {
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
+    std::vector<distributed::VarHandlePtr> rets;
     for (size_t i = 0; i < outs.size(); i++) {
       VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
-      rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
+      rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]));
     }
     if (sync_mode) {
-      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+      for (size_t i = 0; i < rets.size(); i++) {
+        PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index 82a70e4bf1..48322ac7fd 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <future>  // NOLINT
 #include <ostream>
 
+#include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -45,18 +46,19 @@ class SendOp : public framework::OperatorBase {
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
+    std::vector<distributed::VarHandlePtr> rets;
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-        // TODO(Yancey1989): we need to use an IO threadpool which has
-        // a larger number of threads than the computing threadpool.
-        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
+        rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]));
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
     }
     if (sync_send) {
-      PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+      for (size_t i = 0; i < rets.size(); i++) {
+        PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+      }
     }
   }
 };

From 312e92ab072297dae3bf2baf6479b51bfc9b88e6 Mon Sep 17 00:00:00 2001
From: Shan Yi <35982308+shanyi15@users.noreply.github.com>
Date: Wed, 12 Sep 2018 10:43:32 +0800
Subject: [PATCH 47/85] update-readme

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 60ffbe7281..45186ec4ef 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
 
-### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
+### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
 
 ## Installation
 
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website.
 
 ## Documentation
 
-We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
 
    We appreciate your contributions!
 

From 36d6e44681c3ebe1ff3992b37a981ca468580080 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 12 Sep 2018 03:41:39 +0000
Subject: [PATCH 48/85] fix test_py_reader_using_executor error

---
 .../fluid/tests/unittests/test_py_reader_using_executor.py     | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index 931cac409f..0fb9518a45 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -96,7 +96,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
         self.queue_capacity = 50
 
     def test(self):
-        for use_cuda in [False, True]:
+        for use_cuda in ([False, True]
+                         if core.core.is_compiled_with_cuda() else [False]):
             for use_parallel_executor in [False, True]:
                 for use_double_buffer in [False, True]:
                     print('Test Parameters:'),

From 5ce1a960a5dc91459718422379b8bbf398574584 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Wed, 12 Sep 2018 12:47:28 +0800
Subject: [PATCH 49/85] move bcast op into pass

---
 benchmark/fluid/args.py                       |  6 +++
 benchmark/fluid/fluid_benchmark.py            |  9 ++++
 benchmark/fluid/models/mnist.py               | 11 +++--
 .../framework/details/all_reduce_op_handle.cc |  7 +++-
 .../framework/details/broadcast_op_handle.cc  |  7 ++++
 .../details/data_balance_op_handle.cc         |  7 ++++
 .../details/multi_devices_graph_pass.cc       | 42 ++++++++++++++-----
 .../details/multi_devices_graph_pass.h        |  4 +-
 .../framework/details/reduce_op_handle.cc     |  6 ++-
 .../details/scale_loss_grad_op_handle.cc      |  2 +-
 paddle/fluid/pybind/pybind.cc                 |  1 -
 python/paddle/fluid/parallel_executor.py      | 10 -----
 12 files changed, 82 insertions(+), 30 deletions(-)

diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py
index ed696e82f8..0d5c9652de 100644
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -140,5 +140,11 @@ def parse_args():
         '--use_lars',
         action='store_true',
         help='If set, use lars for optimizers, ONLY support resnet module.')
+    parser.add_argument(
+        '--reduce_strategy',
+        type=str,
+        choices=['reduce', 'all_reduce'],
+        default='all_reduce',
+        help='Specify the reduce strategy, can be reduce, all_reduce')
     args = parser.parse_args()
     return args
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 25622ee06c..ddd9fe8098 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -170,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
     strategy = fluid.ExecutionStrategy()
     strategy.num_threads = args.cpus
     strategy.allow_op_delay = False
+    build_strategy = fluid.BuildStrategy()
+    if args.reduce_strategy == "reduce":
+        build_strategy.reduce_strategy = fluid.BuildStrategy(
+        ).ReduceStrategy.Reduce
+    else:
+        build_strategy.reduce_strategy = fluid.BuildStrategy(
+        ).ReduceStrategy.AllReduce
+
     avg_loss = train_args[0]
 
     if args.update_method == "pserver":
@@ -184,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
         avg_loss.name,
         main_program=train_prog,
         exec_strategy=strategy,
+        build_strategy=build_strategy,
         num_trainers=num_trainers,
         trainer_id=trainer_id)
 
diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py
index cef8657ee6..f123e07fb7 100644
--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
@@ -67,11 +67,14 @@ def cnn_model(data):
 
 def get_model(args, is_train, main_prog, startup_prog):
     # NOTE: mnist is small, we don't implement data sharding yet.
-    filelist = [
-        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-    ]
+    opt = None
+    data_file_handle = None
     with fluid.program_guard(main_prog, startup_prog):
         if args.use_reader_op:
+            filelist = [
+                os.path.join(args.data_path, f)
+                for f in os.listdir(args.data_path)
+            ]
             data_file_handle = fluid.layers.open_files(
                 filenames=filelist,
                 shapes=[[-1, 1, 28, 28], (-1, 1)],
@@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog):
             if is_train:
                 opt = fluid.optimizer.AdamOptimizer(
                     learning_rate=0.001, beta1=0.9, beta2=0.999)
-                opt.minimize()
+                opt.minimize(avg_cost)
                 if args.memory_optimize:
                     fluid.memory_optimize(main_prog)
 
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index bf493a3fa4..8450d8eb8b 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -46,7 +46,12 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
-  platform::RecordEvent r("all_reduce", nullptr);
+  // Declare the profiler event at function scope so that it spans the whole
+  // RunImpl() call. An event object declared inside an if/else branch is
+  // destroyed as soon as that branch closes and cannot cover the work below.
+  platform::RecordEvent record_event(
+      Name(), dev_ctxes_.empty() ? nullptr : dev_ctxes_.begin()->second);
+
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 1d9f1bd6e4..35962ade99 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -15,12 +15,19 @@
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
 void BroadcastOpHandle::RunImpl() {
+  // Declare the profiler event at function scope so that it spans the whole
+  // RunImpl() call. An event object declared inside an if/else branch is
+  // destroyed as soon as that branch closes and cannot cover the work below.
+  platform::RecordEvent record_event(
+      Name(), dev_ctxes_.empty() ? nullptr : dev_ctxes_.begin()->second);
+
   if (places_.size() == 1) return;
 
   // The input and output may have dummy vars.
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
index 525d243224..91f6a42e6e 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/data_balance_op_handle.h"
 #include <algorithm>
 #include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -86,6 +87,12 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
 }
 
 void DataBalanceOpHandle::RunImpl() {
+  // Declare the profiler event at function scope so that it spans the whole
+  // RunImpl() call. An event object declared inside an if/else branch is
+  // destroyed as soon as that branch closes and cannot cover the work below.
+  platform::RecordEvent record_event(
+      Name(), dev_ctxes_.empty() ? nullptr : dev_ctxes_.begin()->second);
+
   PADDLE_ENFORCE_GT(places_.size(), 1,
                     "Data balance can only be enabled when the number of "
                     "places to run larger than 1.");
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 7a99169849..cd6c8b50a9 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -348,14 +348,31 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
 
   size_t cur_device_id = 0;
   bool is_forwarding = true;
+  bool is_dist_train = false;
 
   for (ir::Node *node : sorted_ops) {
     if (boost::get<int>(
             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
         static_cast<int>(OpRole::kRPC)) {
-      CreateRPCOp(&result, node);
+      int op_dev_id = CreateRPCOp(&result, node);
+      PADDLE_ENFORCE(op_dev_id != -1,
+                     "Can not schedule the RPC operator to the right place.");
+      if (node->Op()->Type() == "recv") {
+        auto recv_vars_attr =
+            boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
+                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+        PADDLE_ENFORCE(recv_vars_attr.size() == 2UL);  // [parameter, gradient]
+        if (recv_vars_attr[0].find(".block") == std::string::npos) {
+          bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]);
+        }
+      }
+      is_dist_train = true;
     } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
-      CreateDistTrainOp(&result, node);
+      int op_dev_id = CreateDistTrainOp(&result, node);
+      if (node->Op()->Type() == "concat") {
+        auto origin_param_name = node->Op()->OutputArgumentNames()[0];
+        bcast_var_name_set[op_dev_id].emplace(origin_param_name);
+      }
     } else if (IsScaleLossOp(node)) {
       // user can customize loss@grad if not use_default_grad_scale_
       if (strategy_.gradient_scale_ !=
@@ -414,7 +431,10 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
                     CreateReduceOp(&result, g_name, cur_device_id);
                     graph->Get<ShardedVarDevice>(kShardedVarDevice)
                         .emplace(g_name, cur_device_id);
-                    bcast_var_name_set[cur_device_id].emplace(p_name);
+                    if (!is_dist_train) {
+                      // will send gradients directly when distributed training
+                      bcast_var_name_set[cur_device_id].emplace(p_name);
+                    }
                     break;
                   case BuildStrategy::ReduceStrategy::kAllReduce:
                     if (IsSparseGradient(g_name)) {
@@ -436,14 +456,14 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
       }
     }
   }
-
   bool use_gpu = false;
 #ifdef PADDLE_WITH_CUDA
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
 
-  if (use_gpu ||
-      strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+  if ((use_gpu &&
+       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
+      is_dist_train) {
     // Insert BCast Ops
     for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
       auto &to_bcast_set = bcast_var_name_set[dev_id];
@@ -676,8 +696,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
   return var;
 }
 
-void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
-                                                ir::Node *node) const {
+int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
+                                               ir::Node *node) const {
   int op_dev_id = -1;
   std::vector<std::string> input_var_names;
   std::vector<std::string> output_var_names;
@@ -720,6 +740,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                  node->Op()->Type());
 
   CreateComputationalOp(result, node, op_dev_id);
+  return op_dev_id;
 }
 
 void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
@@ -738,8 +759,8 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
 }
 
 // Create RPC related op handles that connects its in ops and out ops.
-void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
-                                          ir::Node *node) const {
+int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
+                                         ir::Node *node) const {
   int op_dev_id = -1;
   if (node->Op()->Type() == "send") {
     // TODO(paddle-dev): getting the first var is not safe.
@@ -825,6 +846,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
       CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
     }
   }
+  return op_dev_id;
 }
 
 bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index ac6d9c5a64..1ca8c4b855 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 
   bool IsScaleLossOp(ir::Node *node) const;
 
-  void CreateRPCOp(ir::Graph *result, ir::Node *node) const;
-  void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
+  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
+  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
 
   /**
    * Is this operator as the end-point operator before/after send operator.
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 6c7e5c1fb0..878828693b 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -27,7 +27,11 @@ namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
-  platform::RecordEvent r("reduce", nullptr);
+  // Declare the profiler event at function scope so that it spans the whole
+  // RunImpl() call. An event object declared inside an if/else branch is
+  // destroyed as soon as that branch closes and cannot cover the work below.
+  platform::RecordEvent record_event(
+      Name(), dev_ctxes_.empty() ? nullptr : dev_ctxes_.begin()->second);
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
   auto in_var_handles = DynamicCast<VarHandle>(inputs_);
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index 609e185819..ba243979b3 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
               ->stream();
       memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                    platform::CPUPlace(), &coeff_, sizeof(float), stream);
-      VLOG(1) << place_ << "RUN Scale loss grad op";
+      VLOG(10) << place_ << "RUN Scale loss grad op";
     });
 #endif
   }
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 20fc08e21d..8bc30fc123 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -683,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle.
                   const std::string &, Scope *, std::vector<Scope *> &,
                   const ExecutionStrategy &, const BuildStrategy &, size_t,
                   size_t>())
-      .def("_bcast_params", &ParallelExecutor::BCastParamsToDevices)
       // NOTE: even we return a vec<Scope*>* to Python use reference policy.
       // We still cannot get local_scope from this vector, since the element
       // of vec<Scope*> will be freed by Python GC. We can only return Scope*
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 4790e0f611..058f414e9b 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -279,21 +279,11 @@ class ParallelExecutor(object):
         self.executor.run(fetch_list, fetch_var_name)
         arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
 
-        if self.is_dist:
-            self._bcast_params()
-
         if return_numpy:
             return executor.as_numpy(arr)
 
         return [arr[i] for i in range(len(arr))]
 
-    def _bcast_params(self):
-        """
-        Broadcast the parameters to other devices. It is used during
-        distributed training.
-        """
-        self.executor._bcast_params(set(self.persistable_vars))
-
     @property
     def device_count(self):
         return len(self._act_places)

From d61c11764af1249c8acc6937f2c25a8ae6c86c3e Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 12 Sep 2018 12:50:50 +0800
Subject: [PATCH 50/85] follow comment add enforce

---
 paddle/fluid/framework/operator.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index bbd141cb3b..b7fae7171a 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -471,10 +471,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
       return false;
     }
     const auto& in = it->second;
-
-    if (in.size() != 1 || in[0] == kEmptyVarName) {
+    if (in.size() == 0 || in[0] == kEmptyVarName) {
       return false;
     }
+    PADDLE_ENFORCE_EQ(in.size(), 1UL,
+                      "Input %s should not have more than one inputs", name);
     return scope_.FindVar(in[0]) != nullptr;
   }
 
@@ -486,9 +487,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
       return false;
     }
     const auto& out = it->second;
-    if (out.size() != 1 || out[0] == kEmptyVarName) {
+    if (out.size() == 0 || out[0] == kEmptyVarName) {
       return false;
     }
+    PADDLE_ENFORCE_EQ(out.size(), 1UL,
+                      "Output %s should not have more than one outputs", name);
     return scope_.FindVar(out[0]) != nullptr;
   }
 

From d41176411fd4f5f06155c7c73264f1145ecccee7 Mon Sep 17 00:00:00 2001
From: Jiabin Yang <marsyang199376@gmail.com>
Date: Wed, 12 Sep 2018 13:08:02 +0800
Subject: [PATCH 51/85] Update test_py_reader_using_executor.py

---
 .../fluid/tests/unittests/test_py_reader_using_executor.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index 0fb9518a45..b7fad9b3a6 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -97,7 +97,7 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
 
     def test(self):
         for use_cuda in ([False, True]
-                         if core.core.is_compiled_with_cuda() else [False]):
+                         if core.is_compiled_with_cuda() else [False]):
             for use_parallel_executor in [False, True]:
                 for use_double_buffer in [False, True]:
                     print('Test Parameters:'),

From bdd957b4be7a023426f76ae6e3153aa5a0e1686f Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 12 Sep 2018 05:18:22 +0000
Subject: [PATCH 52/85] fix test_parallel_executor_transformer

---
 .../tests/unittests/test_parallel_executor_transformer.py     | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index 5ad922725a..a55b2002ed 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -20,6 +20,7 @@ import numpy as np
 from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
 import paddle
+import paddle.fluid.core as core
 import paddle.dataset.wmt16 as wmt16
 import os
 
@@ -170,7 +171,8 @@ class TestTransformer(TestParallelExecutorBase):
                 writer.complete_append_tensor()
 
     def test_main(self):
-        self.check_network_convergence(transformer, use_cuda=True)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(transformer, use_cuda=True)
         self.check_network_convergence(transformer, use_cuda=False, iter=5)
 
 

From 670c58bea4c4cd68b575e9d7e3f39da783facb52 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 12 Sep 2018 07:24:50 +0000
Subject: [PATCH 53/85] fix mac test_data_balance

---
 python/paddle/fluid/tests/unittests/test_data_balance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py
index e39eedd282..4bd24510bc 100644
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
@@ -84,7 +84,7 @@ class TestDataBalance(unittest.TestCase):
         self.data_file_name = './data_balance_test.recordio'
         self.lod_data_file_name = './data_balance_with_lod_test.recordio'
         self.total_ins_num = 50
-        self.batch_size = 10
+        self.batch_size = 12
         self.prepare_data()
         self.prepare_lod_data()
 

From 41de582bb092dfa67bd2a1fa5d3b469db1ae81e2 Mon Sep 17 00:00:00 2001
From: Sylwester Fraczek <sylwester.fraczek@intel.com>
Date: Wed, 12 Sep 2018 10:22:11 +0200
Subject: [PATCH 54/85] create conv relu pass for MKLDNN (#13258)

---
 paddle/fluid/framework/ir/CMakeLists.txt      |   6 +
 .../ir/conv_relu_mkldnn_fuse_pass.cc          |  90 +++++++++++++++
 .../framework/ir/conv_relu_mkldnn_fuse_pass.h |  39 +++++++
 .../ir/conv_relu_mkldnn_fuse_pass_tester.cc   | 108 ++++++++++++++++++
 .../framework/ir/graph_pattern_detector.cc    |  33 ++++++
 .../framework/ir/graph_pattern_detector.h     |  22 ++++
 6 files changed, 298 insertions(+)
 create mode 100644 paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
 create mode 100644 paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
 create mode 100644 paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index ce3ebed00b..7004f484a9 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -28,6 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
+if(WITH_MKLDNN)
+  pass_library(conv_relu_mkldnn_fuse_pass inference)
+endif()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
@@ -42,3 +45,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
+if(WITH_MKLDNN)
+  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+endif()
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
new file mode 100644
index 0000000000..4408cb45ac
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());
+
+  // Match conv2d(Input, Filter, Bias) -> conv_out -> relu -> relu_out; the
+  // conv_input node is created here, patterns::ConvReLU adds the others.
+  GraphPatternDetector gpd;
+  auto* conv_input = gpd.mutable_pattern()
+                         ->NewNode("conv_relu_mkldnn_fuse/conv_input")
+                         ->AsInput()
+                         ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(),
+                                       "conv_relu_mkldnn_fuse");
+  conv_relu_pattern(conv_input);
+
+  int found_conv_relu_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvReLU fuse";
+    PADDLE_ENFORCE(subgraph.count(conv_input));  // checked before any at()
+    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
+                              conv_relu_pattern);  // Filter
+    GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern);  // Bias
+    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);    // tmp
+    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern);  // CONV op
+    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);  // Out
+    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);  // ReLU op
+
+    // Describe the fused op: the original conv2d plus fuse_relu = true.
+    OpDesc desc;
+    std::string conv_relu_i_in = subgraph.at(conv_input)->Name();
+    std::string conv_relu_w_in = conv_weight->Name();
+    std::string conv_relu_b_in = conv_bias->Name();
+    std::string conv_relu_out = relu_out->Name();
+    desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
+    desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
+    desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
+    desc.SetOutput("Out", std::vector<std::string>({conv_relu_out}));
+    desc.SetType("conv2d");
+    for (auto& attr : conv->Op()->GetAttrMap()) {
+      desc.SetAttr(attr.first, attr.second);
+    }
+    desc.SetAttr("fuse_relu", true);
+    auto conv_relu_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
+    GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out});
+
+    IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node);
+    IR_NODE_LINK_TO(conv_weight, conv_relu_node);
+    IR_NODE_LINK_TO(conv_bias, conv_relu_node);
+    IR_NODE_LINK_TO(conv_relu_node, relu_out);
+
+    found_conv_relu_count++;
+  };
+
+  gpd(graph.get(), handler);
+
+  AddStatis(found_conv_relu_count);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvReLUFusePass);
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
new file mode 100644
index 0000000000..b5de0d5487
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse conv2d followed by relu into a single conv2d op with fuse_relu = true.
+ */
+class ConvReLUFusePass : public FusePassBase {
+ public:
+  virtual ~ConvReLUFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
new file mode 100644
index 0000000000..82b5fa1886
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  if (type == "conv2d") {
+    op->SetAttr("use_mkldnn", true);
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    op->SetInput("Bias", {inputs[2]});
+  } else if (type == "relu") {
+    op->SetInput("X", inputs);
+  }
+  op->SetOutput("Out", outputs);
+}
+
+// a->OP0->b
+// b->OP1->c
+// (c, weights, bias)->conv->f
+// (f)->relu->g
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "weights" || v == "bias") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"b"}));
+  SetOp(&prog, "OP1", std::vector<std::string>({"b"}),
+        std::vector<std::string>({"c"}));
+  SetOp(&prog, "conv2d", std::vector<std::string>({"c", "weights", "bias"}),
+        std::vector<std::string>({"f"}));
+  SetOp(&prog, "relu", std::vector<std::string>({"f"}),
+        std::vector<std::string>({"g"}));
+
+  return prog;
+}
+
+TEST(ConvReLUFusePass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass");
+
+  int original_nodes_num = graph->Nodes().size();
+
+  graph = pass->Apply(std::move(graph));
+
+  int current_nodes_num = graph->Nodes().size();
+
+  // Remove 3 Nodes: CONV, RELU, conv_out
+  // Add 1 Node: ConvReLU
+  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
+
+  // Assert conv_relu op in newly generated graph
+  int conv_relu_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "conv2d") {
+      if (node->Op()->HasAttr("use_mkldnn")) {
+        bool use_mkldnn = boost::get<bool>(node->Op()->GetAttr("use_mkldnn"));
+        if (use_mkldnn) {
+          if (node->Op()->HasAttr("fuse_relu")) {
+            bool fuse_relu = boost::get<bool>(node->Op()->GetAttr("fuse_relu"));
+            if (fuse_relu) {
+              ++conv_relu_count;
+            }
+          }
+        }
+      }
+    }
+  }
+  EXPECT_EQ(conv_relu_count, 1);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(conv_relu_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 5825a129b7..11d5998aaf 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -522,6 +522,39 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) {
   return false;
 }
 
+PDNode* patterns::ConvReLU::operator()(
+    paddle::framework::ir::PDNode* conv_input) {
+  // Create Operators
+  conv_input->assert_is_op_input("conv2d", "Input");
+  auto* conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  auto* relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu");
+  // Create variables
+  // Filter
+  auto* conv_weight_var = pattern->NewNode(conv_weight_repr())
+                              ->AsInput()
+                              ->assert_is_persistable_var()
+                              ->assert_is_op_input("conv2d", "Filter");
+  // Bias
+  auto* conv_bias_var = pattern->NewNode(conv_bias_repr())
+                            ->AsInput()
+                            ->assert_is_persistable_var()
+                            ->assert_is_op_input("conv2d", "Bias");
+  // intermediate variable, will be removed in the IR after fuse.
+  auto* conv_out_var = pattern->NewNode(conv_out_repr())
+                           ->AsIntermediate()
+                           ->assert_is_only_output_of_op("conv2d")
+                           ->assert_is_op_input("relu");
+  // output
+  auto* relu_out_var = pattern->NewNode(relu_out_repr())
+                           ->AsOutput()
+                           ->assert_is_op_output("relu");
+
+  conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var})
+      .LinksTo({conv_out_var});
+  relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var});
+  return relu_out_var;
+}
+
 PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x,
                                  bool with_bias) {
   // Create shared nodes.
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 57482a07b6..371384dc56 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -360,6 +360,28 @@ struct PatternBase {
   size_t id_;
 };
 
+// CONV with ReLU
+// op: conv + relu
+// named nodes:
+// conv_input, conv_weight,
+// conv_bias, conv_out, conv,
+// relu_out, relu
+struct ConvReLU : public PatternBase {
+  ConvReLU(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_relu") {}
+
+  PDNode* operator()(PDNode* conv_input);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(conv);
+  PATTERN_DECL_NODE(relu);
+  // declare variable node's name
+  PATTERN_DECL_NODE(conv_weight);
+  PATTERN_DECL_NODE(conv_bias);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(relu_out);
+};
+
 // FC with bias
 // op: mul + elementwise_add
 // named nodes:

From b12322ce959a2ab79a1bae4e7aaf9e4b42d56909 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Wed, 12 Sep 2018 19:06:17 +0800
Subject: [PATCH 55/85] fix fusion_lstm unique_name bug

---
 paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc     | 5 ++---
 paddle/fluid/inference/analysis/ir_pass_manager.cc | 8 ++++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index f7fda87357..aa95d3e9f6 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -51,7 +51,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
     if (with_fc_bias) {
       // Add FC-bias with LSTM-bias and create a new weight
       PADDLE_ENFORCE(scope);
-      const std::string& new_bias_var = name_scope + "_bias.new";
+      const std::string& new_bias_var = patterns::UniqueKey("NewBias");
       auto* bias_var = scope->Var(new_bias_var);
       PADDLE_ENFORCE(bias_var);
       auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
@@ -120,7 +120,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-
     GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
@@ -136,7 +135,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
                    fc_bias);
       // Remove unneeded nodes.
       std::unordered_set<const Node*> marked_nodes(
-          {mul, lstm, elementwise_add});
+          {mul, lstm, elementwise_add, fc_bias});
       GraphSafeRemoveNodes(graph, marked_nodes);
     } else {
       GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 30c1e8e93d..e76708baf4 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/scope.h"
@@ -37,13 +38,16 @@ IRPassManager::IRPassManager(const ProgramDesc &program,
 void IRPassManager::Apply(const std::vector<std::string> &passes) {
   // Apply all the passes
   std::string pre_pass;
+  int pass_num = 0;
   for (const std::string &pass_name : passes) {
     PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name);
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
     if (pass_name == "graph_viz_pass") {
-      std::string dot_file_path =
-          "ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot";
+      std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
+                                  (pre_pass.empty() ? "origin" : pre_pass) +
+                                  ".dot";
       pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
+      pass_num++;
     }
     graph_ = pass->Apply(std::move(graph_));
     pre_pass = pass_name;

From 415e0eac692d8aef35726deb2915f92ea7442edf Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 12 Sep 2018 11:29:17 +0000
Subject: [PATCH 56/85] fix mac test_reader_reset

---
 python/paddle/fluid/tests/unittests/test_reader_reset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py
index 8ad11d76f6..a115c37e1d 100644
--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
@@ -41,6 +41,8 @@ class TestReaderReset(unittest.TestCase):
                 self.data_file_name, reader, feeder)
 
     def setUp(self):
+        # set parallel threads to fit 20 batches in line 49
+        os.environ['CPU_NUM'] = str(20)
         self.use_cuda = fluid.core.is_compiled_with_cuda()
         self.data_file_name = './reader_reset_test.recordio'
         self.ins_shape = [3]

From 539b3f300ffe7475cec5114cee32949a54d9d768 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 12 Sep 2018 22:29:39 +0800
Subject: [PATCH 57/85] add ocr analysis ut

---
 .../fluid/inference/tests/api/CMakeLists.txt  |  11 ++
 .../tests/api/analyzer_vis_tester.cc          | 170 ++++++++++++++++++
 2 files changed, 181 insertions(+)
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_vis_tester.cc

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index d44a2cfa7f..ff6bb662c1 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -55,3 +55,14 @@ inference_analysis_test(test_text_classification SRCS analyzer_text_classificati
          --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
          --topn=1 # Just run top 1 batch.
          )
+
+# ocr
+set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz")
+set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr")
+if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE)
+    inference_download_and_uncompress(${OCR_INSTALL_DIR} ${OCR_MODEL_URL})
+endif()
+inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+    ARGS --infer_model=${OCR_INSTALL_DIR}/model
+        --infer_data=${OCR_INSTALL_DIR}/data.txt)
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
new file mode 100644
index 0000000000..7a1bb32a57
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -0,0 +1,170 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
+DEFINE_string(infer_model, "", "model path for LAC");
+DEFINE_string(infer_data, "", "data file for LAC");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+struct Record {
+  std::vector<float> data;
+  std::vector<int32_t> shape;
+};
+
+Record ProcessALine(const std::string &line) {
+  VLOG(3) << "process a line";
+  std::vector<std::string> columns;
+  split(line, '\t', &columns);
+  CHECK_EQ(columns.size(), 2UL)
+      << "data format error, should be <data>\t<shape>";
+
+  Record record;
+  std::vector<std::string> data_strs;
+  split(columns[0], ' ', &data_strs);
+  for (auto &d : data_strs) {
+    record.data.push_back(std::stof(d));
+  }
+
+  std::vector<std::string> shape_strs;
+  split(columns[1], ' ', &shape_strs);
+  for (auto &s : shape_strs) {
+    record.shape.push_back(std::stoi(s));
+  }
+  VLOG(3) << "data size " << record.data.size();
+  VLOG(3) << "data shape size " << record.shape.size();
+  return record;
+}
+
+/*
+ * Use the native and analysis fluid engine to inference the demo.
+ * ocr, mobilenet and se_resnext50
+ */
+void TestVisualPrediction() {
+  std::unique_ptr<PaddlePredictor> predictor;
+  AnalysisConfig cfg;
+  cfg.param_file = FLAGS_infer_model + "/__params__";
+  cfg.prog_file = FLAGS_infer_model + "/__model__";
+  cfg.use_gpu = false;
+  cfg.device = 0;
+  // cfg.specify_input_name = true;
+  cfg.enable_ir_optim = true;
+  predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+
+  // Only have single batch of data.
+  std::string line;
+  std::ifstream file(FLAGS_infer_data);
+  std::getline(file, line);
+  auto record = ProcessALine(line);
+  file.close();
+
+  // Inference.
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.data =
+      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
+  input.dtype = PaddleDType::FLOAT32;
+
+  std::vector<PaddleTensor> outputs_slots;
+  Timer timer;
+  timer.tic();
+  for (int i = 0; i < FLAGS_repeat; i++) {
+    predictor->Run({input}, &outputs_slots);
+  }
+  PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0,
+            timer.toc() / FLAGS_repeat);
+
+  VLOG(3) << "output.size " << outputs_slots.size();
+
+  // run native as reference
+  NativeConfig config;
+  config.param_file = FLAGS_infer_model + "/__params__";
+  config.prog_file = FLAGS_infer_model + "/__model__";
+  config.use_gpu = false;
+  config.device = 0;
+  // config.specify_input_name = true;
+  auto ref_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  std::vector<PaddleTensor> ref_outputs_slots;
+  ref_predictor->Run({input}, &ref_outputs_slots);
+  EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
+  for (size_t i = 0; i < outputs_slots.size(); ++i) {
+    auto &ref_out = ref_outputs_slots[i];
+    auto &out = outputs_slots[i];
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    EXPECT_EQ(size, ref_size);
+    EXPECT_EQ(out.dtype, ref_out.dtype);
+    switch (out.dtype) {
+      case PaddleDType::INT64: {
+        int64_t *pdata = static_cast<int64_t *>(out.data.data());
+        int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_EQ(pdata_ref[j], pdata[j]);
+        }
+        break;
+      }
+      case PaddleDType::FLOAT32: {
+        float *pdata = static_cast<float *>(out.data.data());
+        float *pdata_ref = static_cast<float *>(ref_out.data.data());
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3);
+        }
+        break;
+      }
+    }
+    // print what are fused
+    AnalysisPredictor *analysis_predictor =
+        dynamic_cast<AnalysisPredictor *>(predictor.get());
+    auto &fuse_statis = analysis_predictor->analysis_argument()
+                            .Get<std::unordered_map<std::string, int>>(
+                                framework::ir::kFuseStatisAttr);
+    for (auto &item : fuse_statis) {
+      LOG(INFO) << "fused " << item.first << " " << item.second;
+    }
+    int num_ops = 0;
+    for (auto &node :
+         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
+      }
+    }
+    LOG(INFO) << "has num ops: " << num_ops;
+  }
+}
+
+TEST(Analyzer_vis, analysis) { TestVisualPrediction(); }
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle

From 65f901b36ff210f0cd440d2378312921c5172936 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 12 Sep 2018 22:40:45 +0800
Subject: [PATCH 58/85] disable fc gru temporarily

---
 paddle/fluid/inference/tests/api/analyzer_vis_tester.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 7a1bb32a57..67bde72304 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -73,8 +73,8 @@ void TestVisualPrediction() {
   cfg.prog_file = FLAGS_infer_model + "/__model__";
   cfg.use_gpu = false;
   cfg.device = 0;
-  // cfg.specify_input_name = true;
   cfg.enable_ir_optim = true;
+  cfg.ir_passes.push_back("fc_gru_fuse_pass");
   predictor =
       CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
 

From 01f0f16884f3587f2d01a830e55c7c446a0c8cde Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 12 Sep 2018 23:18:30 +0800
Subject: [PATCH 59/85] enable mkldnn in infer api

---
 paddle/fluid/inference/api/analysis_predictor.cc        | 3 +++
 paddle/fluid/inference/api/api_impl.cc                  | 3 +++
 paddle/fluid/inference/api/paddle_inference_api.h       | 4 +++-
 paddle/fluid/inference/tests/api/analyzer_vis_tester.cc | 8 ++++++--
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 2a9a7aed48..cd52114713 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -77,6 +77,9 @@ bool AnalysisPredictor::Init(
 
   OptimizeInferenceProgram();
   ctx_ = executor_->Prepare(*inference_program_, 0);
+  if (config_.use_mkldnn) {
+    executor_->EnableMKLDNN(*inference_program_);
+  }
 
   VLOG(5) << "to create variables";
   PADDLE_ENFORCE(scope_.get());
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 6fe13ed027..c6cb09667e 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init(
   }
 
   ctx_ = executor_->Prepare(*inference_program_, 0);
+  if (config_.use_mkldnn) {
+    executor_->EnableMKLDNN(*inference_program_);
+  }
   executor_->CreateVariables(*inference_program_,
                              sub_scope_ ? sub_scope_ : scope_.get(), 0);
 
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 995da11e4a..e8d51bb72c 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -45,7 +45,7 @@ class PaddleBuf {
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
   // Own memory.
-  PaddleBuf(size_t length)
+  explicit PaddleBuf(size_t length)
       : data_(new char[length]), length_(length), memory_owned_(true) {}
   // Resize to `length` bytes.
   void Resize(size_t length);
@@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config {
   bool use_gpu{false};
   int device{0};
   float fraction_of_gpu_memory{-1.f};  // Negative to notify initialization.
+  // MKLDNN related fields.
+  bool use_mkldnn{false};
   // Specify the variable's name of each input.
   bool specify_input_name{false};
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 67bde72304..135a81a85c 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -66,12 +66,13 @@ Record ProcessALine(const std::string &line) {
  * Use the native and analysis fluid engine to inference the demo.
  * ocr, mobilenet and se_resnext50
  */
-void TestVisualPrediction() {
+void TestVisualPrediction(bool use_mkldnn) {
   std::unique_ptr<PaddlePredictor> predictor;
   AnalysisConfig cfg;
   cfg.param_file = FLAGS_infer_model + "/__params__";
   cfg.prog_file = FLAGS_infer_model + "/__model__";
   cfg.use_gpu = false;
+  cfg.use_mkldnn = use_mkldnn;
   cfg.device = 0;
   cfg.enable_ir_optim = true;
   cfg.ir_passes.push_back("fc_gru_fuse_pass");
@@ -163,7 +164,10 @@ void TestVisualPrediction() {
   }
 }
 
-TEST(Analyzer_vis, analysis) { TestVisualPrediction(); }
+TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); }
+TEST(Analyzer_vis, analysis_mkldnn) {
+  TestVisualPrediction(/*use_mkldnn*/ true);
+}
 
 }  // namespace analysis
 }  // namespace inference

From dd149d469b4c585d852b6bedc3c2835ee4b5424c Mon Sep 17 00:00:00 2001
From: Sylwester Fraczek <sylwester.fraczek@intel.com>
Date: Wed, 12 Sep 2018 10:22:08 -0700
Subject: [PATCH 60/85] hotfix for conv-relu pass

---
 paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc | 2 +-
 paddle/fluid/inference/analysis/analyzer.h              | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
index 4408cb45ac..09c5ec59d6 100644
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -58,7 +58,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
     desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
     desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
     desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
-    desc.SetOutput("Out", std::vector<std::string>({conv_relu_out}));
+    desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
     desc.SetType("conv2d");
     for (auto& attr : conv->Op()->GetAttrMap()) {
       desc.SetAttr(attr.first, attr.second);
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 399afbe64a..9bdbefc07c 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
       "mul_gru_fuse_pass",         //
       "seq_concat_fc_fuse_pass",   //
       "fc_fuse_pass",              //
+#ifdef PADDLE_WITH_MKLDNN
+      "conv_relu_mkldnn_fuse_pass",  //
+#endif
   }};
 
   std::unordered_set<std::string> disabled_ir_passes_;

From e69d9c845b30d7150f122c41805b1bc5bf75136c Mon Sep 17 00:00:00 2001
From: Bai Yifan <bai.yf@qq.com>
Date: Thu, 13 Sep 2018 09:49:22 +0800
Subject: [PATCH 61/85] code fix (#13365)

---
 paddle/fluid/operators/softmax_with_cross_entropy_op.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 148faec4af..a07c17348e 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -31,7 +31,8 @@ __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size;
        i += blockDim.x * gridDim.x) {
     int idx = i * class_num + labels[i];
-    logit_grad[idx] -= static_cast<T>(1.);
+    logit_grad[idx] -=
+        ignore_index == labels[i] ? static_cast<T>(0.) : static_cast<T>(1.);
   }
 }
 

From 1e1b6622fdce1b704c7753e2c16656bdc97ac24e Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Thu, 13 Sep 2018 10:44:39 +0800
Subject: [PATCH 62/85] update by comment

---
 paddle/fluid/framework/details/all_reduce_op_handle.cc |  6 +-----
 paddle/fluid/framework/details/broadcast_op_handle.cc  |  6 +-----
 .../fluid/framework/details/data_balance_op_handle.cc  |  6 ------
 .../framework/details/multi_devices_graph_pass.cc      | 10 +++-------
 4 files changed, 5 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 8450d8eb8b..7c5f5bd80a 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -46,11 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
-  if (dev_ctxes_.size() > 0UL) {
-    platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
-  } else {
-    platform::RecordEvent record_event(Name(), nullptr);
-  }
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
 
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 35962ade99..4fdab5cd94 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -22,11 +22,7 @@ namespace framework {
 namespace details {
 
 void BroadcastOpHandle::RunImpl() {
-  if (dev_ctxes_.size() > 0UL) {
-    platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
-  } else {
-    platform::RecordEvent record_event(Name(), nullptr);
-  }
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
 
   if (places_.size() == 1) return;
 
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
index 91f6a42e6e..8eb3568e05 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -87,12 +87,6 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
 }
 
 void DataBalanceOpHandle::RunImpl() {
-  if (dev_ctxes_.size() > 0UL) {
-    platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
-  } else {
-    platform::RecordEvent record_event(Name(), nullptr);
-  }
-
   PADDLE_ENFORCE_GT(places_.size(), 1,
                     "Data balance can only be enabled when the number of "
                     "places to run larger than 1.");
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index cd6c8b50a9..11b085c5c7 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -431,10 +431,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
                     CreateReduceOp(&result, g_name, cur_device_id);
                     graph->Get<ShardedVarDevice>(kShardedVarDevice)
                         .emplace(g_name, cur_device_id);
-                    if (!is_dist_train) {
-                      // will send gradients directly when distributed training
-                      bcast_var_name_set[cur_device_id].emplace(p_name);
-                    }
+                    bcast_var_name_set[cur_device_id].emplace(p_name);
                     break;
                   case BuildStrategy::ReduceStrategy::kAllReduce:
                     if (IsSparseGradient(g_name)) {
@@ -461,9 +458,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
 
-  if ((use_gpu &&
-       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
-      is_dist_train) {
+  if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce &&
+      !is_dist_train) {
     // Insert BCast Ops
     for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
       auto &to_bcast_set = bcast_var_name_set[dev_id];

From 4778c6e21c6918535b648b9c9bc55e9f0ba56e99 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Thu, 13 Sep 2018 10:45:27 +0800
Subject: [PATCH 63/85] delete unused py codes

---
 python/paddle/fluid/parallel_executor.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index a6395d9e20..44af29d339 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -142,11 +142,6 @@ class ParallelExecutor(object):
         main = main if main else framework.default_main_program()
         if scope == None:
             scope = executor.global_scope()
-        # FIXME(Yancey1989): it's a temporary approach to determinate the distribute
-        # train program, call self.bcast_param() at the end of each mini-batch.
-        self.is_dist = True if "recv" in [
-            op.type for op in main.global_block().ops
-        ] else False
 
         if share_vars_from and not isinstance(share_vars_from,
                                               ParallelExecutor):

From 1664899b63ba8175f7ad5616a031a01c1e54ca1a Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Thu, 13 Sep 2018 10:53:16 +0800
Subject: [PATCH 64/85] update

---
 paddle/fluid/framework/details/data_balance_op_handle.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
index 8eb3568e05..525d243224 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -15,7 +15,6 @@
 #include "paddle/fluid/framework/details/data_balance_op_handle.h"
 #include <algorithm>
 #include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {

From 5b5fa37fb98bfa05f23e5ad508f6dbf3e7ec9f93 Mon Sep 17 00:00:00 2001
From: Jiabin Yang <marsyang199376@gmail.com>
Date: Thu, 13 Sep 2018 10:53:51 +0800
Subject: [PATCH 65/85] Update test_reader_reset.py

import os module to use os.environ in setUp()
---
 python/paddle/fluid/tests/unittests/test_reader_reset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py
index a115c37e1d..e97a05b6f9 100644
--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
-
+import os
 import paddle.fluid as fluid
 import paddle
 import numpy as np

From 2b10aee52a3f6f23b0243ee64b4f4d722fa41383 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 13 Sep 2018 12:04:16 +0800
Subject: [PATCH 66/85] disable seqexpandconcatfc op test on Mac

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 8ac1cb164e..9d7c528dbd 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -28,6 +28,10 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
 
 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
 list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
+if(APPLE)
+    # this op is not supported on Mac
+    list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
+endif()
 
 function(py_test_modules TARGET_NAME)
   if(WITH_TESTING)

From 49bafc05bf7380874c92bd2954c5c96bca695ee4 Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Thu, 13 Sep 2018 05:35:29 +0000
Subject: [PATCH 67/85] fix comments and set name for trt layer and ITensor

---
 .../fluid/inference/analysis/subgraph_splitter.cc  | 12 ++++++++++--
 .../inference/tensorrt/convert/activation_op.cc    |  2 ++
 .../inference/tensorrt/convert/batch_norm_op.cc    |  2 ++
 .../fluid/inference/tensorrt/convert/concat_op.cc  |  4 ++++
 .../fluid/inference/tensorrt/convert/conv2d_op.cc  |  5 +++++
 .../inference/tensorrt/convert/elementwise_op.cc   |  4 ++++
 paddle/fluid/inference/tensorrt/convert/fc_op.cc   |  2 ++
 .../fluid/inference/tensorrt/convert/pool2d_op.cc  |  2 ++
 paddle/fluid/operators/tensorrt_engine_op.h        | 14 --------------
 9 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index 773fceeeb2..c3a2dbf9d1 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -85,6 +85,14 @@ struct BriefNode {
   std::vector<BriefNode *> outlinks;
 };
 
+// Merge two adjacent BriefNodes.
+// Suppose we have two adjacent nodes, src and dst.
+// We perform the following operations:
+// 1. Add all inputs of dst (except src) to src's inlinks.
+// 2. Add all outputs of dst to src's outlinks.
+// 3. Redirect the corresponding inlinks and outlinks of dst's inputs
+//    and outputs so that they point to src instead of dst.
+// 4. Delete all of dst's inlinks and outlinks.
 void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
                           int src_id, int dst_id) {
   // merge the two adjacent nodes into one node.
@@ -224,8 +232,8 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
     //  Our algorithm must guarantee that:
     //  1. The graph is always directed acyclic graph（DAG）.
     //  2. If there is a path in the subgraph from X to Y (X and Y are both
-    //  nodes
-    //     in the subgraph), then all paths from X to Y are in the subgraph.
+    //  nodes in the subgraph), then all paths from X to Y are in the
+    //  subgraph.
     //
     //  In order to achieve the above guarantee.
     //  For adjacent nodes src -> dst.
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index e1cace9cc1..8168cdff1b 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -35,6 +35,8 @@ class ReluOpConverter : public OpConverter {
         engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
         nvinfer1::ActivationType::kRELU);
     auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("relu (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {  // the test framework can not determine which is the
                       // output, so place the declaration inside.
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
index 94f8b0ae56..3330af2da6 100644
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter {
                              scale_weights.get(), power_weights.get());
 
     auto output_name = op_desc.Output("Y").front();
+    layer->setName(("batch_norm (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->weight_map[op_desc.Input("Bias").front()] =
         std::move(combile_bias_tensor);
     engine_->weight_map[op_desc.Input("Scale").front()] =
diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
index bb9627bf95..2983e91cb2 100644
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -30,7 +30,9 @@ class ConcatOpConverter : public OpConverter {
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     std::vector<nvinfer1::ITensor*> itensors;
+    std::cout << "Concat op: " << std::endl;
     for (auto& input_name : op_desc.Input("X")) {
+      std::cout << input_name << std::endl;
       itensors.push_back(engine_->GetITensor(input_name));
     }
     int axis = boost::get<int>(op_desc.GetAttr("axis"));
@@ -42,6 +44,8 @@ class ConcatOpConverter : public OpConverter {
     axis = axis - 1;  // Remove batch dim
     layer->setAxis(axis);
     auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("concat (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {  // the test framework can not determine which is the
                       // output, so place the declaration inside.
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 841a95db38..022e43a571 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -26,6 +26,9 @@ class Conv2dOpConverter : public OpConverter {
         << "convert a fluid conv2d op to tensorrt conv layer without bias";
 
     framework::OpDesc op_desc(op, nullptr);
+    std::cout << "Conv op: " << std::endl;
+    std::cout << op_desc.Input("Input").front() << std::endl;
+    std::cout << op_desc.Output("Output").front() << std::endl;
     PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
     PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
     PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
@@ -78,8 +81,10 @@ class Conv2dOpConverter : public OpConverter {
     layer->setNbGroups(groups);
 
     auto output_name = op_desc.Output("Output").front();
+    layer->setName(("conv2d (Output: " + output_name + ")").c_str());
     engine_->weight_map[op_desc.Input("Filter").front()] =
         std::move(weight_tensor);
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {
       engine_->DeclareOutput(output_name);
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
index 60a72b4eb5..0a6ce568f1 100644
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
         shift_weights.get(), scale_weights.get(), power_weights.get());
     auto output_name = op_desc.Output("Out")[0];
 
+    layer->setName(("elementwise_add (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {  // the test framework can not determine which is the
@@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter {
         *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
 
     auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("elementwise (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {  // the test framework can not determine which is the
                       // output, so place the declaration inside.
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index ad98d85aae..7c21ecd95d 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter {
                                        n_output, tmp_weight.get(), bias.get());
 
     auto output_name = op_desc.Output("Out").front();
+    layer->setName(("fc (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp);
     if (test_mode) {
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 73f1b28ddf..f9bb66a6e9 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter {
     layer->setPadding(nv_paddings);
 
     auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("pool2d (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {
       engine_->DeclareOutput(output_name);
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 395d8bcc07..79e75ea9a0 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -161,20 +161,6 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
               boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
           size * sizeof(float));
 
-      // TODO(zhaolong) : delete it sometimes
-      /* THIS CODE JUST FOR TEST
-      std::cout << output_maps[output_index] << std::endl;
-      platform::CPUPlace cpu_place;
-      framework::LoDTensor temp_tensor;
-      temp_tensor.Resize(framework::make_ddim(ddim));
-      auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
-
-      TensorCopySync(*fluid_t, cpu_place ,&temp_tensor);
-      for(int i = 0; i < size; i++) {
-        std::cout << temp_data[i] <<  " " ;
-      }
-      std::cout << std::endl;
-      */
       output_index += 1;
     }
 

From bad4ea192e195f7d6f912eb0f8647e29e7ef929e Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Thu, 13 Sep 2018 14:15:17 +0800
Subject: [PATCH 68/85] update by comment

---
 paddle/fluid/framework/details/multi_devices_graph_pass.cc | 5 +++--
 paddle/fluid/framework/details/reduce_op_handle.cc         | 7 ++-----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 11b085c5c7..5781936cb3 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -458,8 +458,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
 
-  if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce &&
-      !is_dist_train) {
+  if ((use_gpu &&
+       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
+      is_dist_train) {
     // Insert BCast Ops
     for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
       auto &to_bcast_set = bcast_var_name_set[dev_id];
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 878828693b..7fc06f234d 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -27,11 +27,8 @@ namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
-  if (dev_ctxes_.size() > 0UL) {
-    platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
-  } else {
-    platform::RecordEvent record_event(Name(), nullptr);
-  }
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
   auto in_var_handles = DynamicCast<VarHandle>(inputs_);

From dd0b2036c68b6601ca6722f510068d0eb162eda9 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 13 Sep 2018 15:01:41 +0800
Subject: [PATCH 69/85] add note for use mkldnn

---
 paddle/fluid/inference/api/analysis_predictor.cc        | 2 +-
 paddle/fluid/inference/api/api_impl.cc                  | 2 +-
 paddle/fluid/inference/api/paddle_inference_api.h       | 4 ++--
 paddle/fluid/inference/tests/api/analyzer_vis_tester.cc | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index cd52114713..684e0ce0e2 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -77,7 +77,7 @@ bool AnalysisPredictor::Init(
 
   OptimizeInferenceProgram();
   ctx_ = executor_->Prepare(*inference_program_, 0);
-  if (config_.use_mkldnn) {
+  if (config_._use_mkldnn) {
     executor_->EnableMKLDNN(*inference_program_);
   }
 
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index c6cb09667e..2e9e10139f 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -106,7 +106,7 @@ bool NativePaddlePredictor::Init(
   }
 
   ctx_ = executor_->Prepare(*inference_program_, 0);
-  if (config_.use_mkldnn) {
+  if (config_._use_mkldnn) {
     executor_->EnableMKLDNN(*inference_program_);
   }
   executor_->CreateVariables(*inference_program_,
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index e8d51bb72c..55a07ca705 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -121,8 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config {
   bool use_gpu{false};
   int device{0};
   float fraction_of_gpu_memory{-1.f};  // Negative to notify initialization.
-  // MKLDNN related fields.
-  bool use_mkldnn{false};
+  // NOTE: Do NOT use this field; internal testing only, will be removed later.
+  bool _use_mkldnn{false};
   // Specify the variable's name of each input.
   bool specify_input_name{false};
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 135a81a85c..3675c5f7f3 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -72,7 +72,7 @@ void TestVisualPrediction(bool use_mkldnn) {
   cfg.param_file = FLAGS_infer_model + "/__params__";
   cfg.prog_file = FLAGS_infer_model + "/__model__";
   cfg.use_gpu = false;
-  cfg.use_mkldnn = use_mkldnn;
+  cfg._use_mkldnn = use_mkldnn;
   cfg.device = 0;
   cfg.enable_ir_optim = true;
   cfg.ir_passes.push_back("fc_gru_fuse_pass");

From 26b1704befe6963247b14272046aa5698d2277c3 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Thu, 13 Sep 2018 15:02:10 +0800
Subject: [PATCH 70/85] fix with distribute cmake

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 8ac1cb164e..19e9882ed6 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -46,6 +46,7 @@ function(py_test_modules TARGET_NAME)
 endfunction()
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
+list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
@@ -61,11 +62,12 @@ if(WITH_DISTRIBUTE)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
     set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
     set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
+    py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
+    py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
+    py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
-py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
-py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
 py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)

From cb4a73be010d2314531173e28022e3b5d163c033 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Thu, 13 Sep 2018 15:45:27 +0800
Subject: [PATCH 71/85] fix fluid_benchmark resnet lr decay

---
 benchmark/fluid/models/resnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index d71b855612..1b3bfe659c 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -207,7 +207,7 @@ def get_model(args, is_train, main_prog, startup_prog):
 
                 total_images = 1281167 / trainer_count
 
-                step = int(total_images / args.batch_size + 1)
+                step = int(total_images / (args.batch_size * args.gpus) + 1)
                 epochs = [30, 60, 90]
                 bd = [step * e for e in epochs]
                 base_lr = args.learning_rate

From 0092ad32856ea17c494a64b02e51d8bf14a0ad20 Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Thu, 13 Sep 2018 08:08:35 +0000
Subject: [PATCH 72/85] delete unused log

---
 paddle/fluid/inference/tensorrt/convert/concat_op.cc | 2 --
 paddle/fluid/inference/tensorrt/convert/conv2d_op.cc | 3 ---
 2 files changed, 5 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
index 2983e91cb2..a11dfa1e8f 100644
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -30,9 +30,7 @@ class ConcatOpConverter : public OpConverter {
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     std::vector<nvinfer1::ITensor*> itensors;
-    std::cout << "Concat op: " << std::endl;
     for (auto& input_name : op_desc.Input("X")) {
-      std::cout << input_name << std::endl;
       itensors.push_back(engine_->GetITensor(input_name));
     }
     int axis = boost::get<int>(op_desc.GetAttr("axis"));
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 022e43a571..0a37d3968c 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -26,9 +26,6 @@ class Conv2dOpConverter : public OpConverter {
         << "convert a fluid conv2d op to tensorrt conv layer without bias";
 
     framework::OpDesc op_desc(op, nullptr);
-    std::cout << "Conv op: " << std::endl;
-    std::cout << op_desc.Input("Input").front() << std::endl;
-    std::cout << op_desc.Output("Output").front() << std::endl;
     PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
     PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
     PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);

From 3a3f28f99b87b2626bf872b1cfc4faf631c07443 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Thu, 13 Sep 2018 16:35:56 +0800
Subject: [PATCH 73/85] add (#13377)

---
 paddle/fluid/operators/distributed/grpc_client.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 07ac20797d..e22bc552f8 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -290,12 +290,18 @@ void GRPCClient::Proceed() {
       c->Finish(false);
     }
 
-    delete c;
+    bool notify = false;
     {
       std::lock_guard<std::mutex> lk(sync_mutex_);
       req_count_--;
+      notify = (req_count_ <= 0 || !c->status_.ok());
+    }
+
+    delete c;
+
+    if (notify) {
+      sync_cond_.notify_all();
     }
-    sync_cond_.notify_all();
   }
   VLOG(3) << "GRPCClient Proceed end";
 }

From 29f5a93b5f5b87b5f7a7f059b1471d373b15e740 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Thu, 13 Sep 2018 17:17:57 +0800
Subject: [PATCH 74/85] add analyzer_rnn2_test

---
 .../fluid/inference/tests/api/CMakeLists.txt  |  58 +++---
 .../tests/api/analyzer_rnn2_tester.cc         | 181 ++++++++++++++++++
 2 files changed, 211 insertions(+), 28 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index d44a2cfa7f..ece0d33399 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -1,56 +1,58 @@
-function (inference_download_and_uncompress install_dir url)
-    get_filename_component(filename ${url} NAME)
-    message(STATUS "Download inference test stuff ${filename} from ${url}")
+set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
+set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo")
+set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
+function (inference_download_and_uncompress install_dir filename)
+    message(STATUS "Download inference test stuff from ${INFERENCE_URL}/${filename}")
     execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
-    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
+    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${INFERENCE_URL}/${filename}")
     execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
     message(STATUS "finish downloading ${filename}")
 endfunction(inference_download_and_uncompress)
 
-function(download_model_and_data install_dir model_url data_url)
+function(download_model_and_data install_dir model_name data_name)
     if (NOT EXISTS ${install_dir} AND WITH_INFERENCE)
-        inference_download_and_uncompress(${install_dir} ${model_url})
-        inference_download_and_uncompress(${install_dir} ${data_url})
+        inference_download_and_uncompress(${install_dir} ${model_name})
+        inference_download_and_uncompress(${install_dir} ${data_name})
     endif()
 endfunction()
 
 # RNN1
-set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz")
-set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz")
-set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1")
-download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} ${RNN1_DATA_URL})
-inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
+download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
+inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc 
+    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
     ARGS --infer_model=${RNN1_INSTALL_DIR}/model
          --infer_data=${RNN1_INSTALL_DIR}/data.txt)
 
+# RNN2
+set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
+download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
+inference_analysis_test(test_analyzer_rnn2 SRCS analyzer_rnn2_tester.cc
+    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+    ARGS --infer_model=${RNN2_INSTALL_DIR}/model
+         --infer_data=${RNN2_INSTALL_DIR}/data.txt)
+
 # chinese_ner
-set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
-set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
-set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner")
-download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} ${CHINESE_NER_DATA_URL})
+set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
+download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
 inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
+    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
     ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
         --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
 
 # lac
-set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
-set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
-set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac")
-download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} ${LAC_DATA_URL})
+set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
+download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
 inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
     ARGS --infer_model=${LAC_INSTALL_DIR}/model
         --infer_data=${LAC_INSTALL_DIR}/data.txt)
 
 # text_classification
-set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
-set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
-set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification")
-download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL})
+set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
+download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
 inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
+    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
     ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
          --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
          --topn=1 # Just run top 1 batch.
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
new file mode 100644
index 0000000000..c40ea58eea
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -0,0 +1,181 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+
+#include <google/protobuf/text_format.h>
+#include <gtest/gtest.h>
+#include <thread>  // NOLINT
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data path");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+
+namespace paddle {
+namespace inference {
+
+using namespace framework;  // NOLINT
+
+struct DataRecord {  // Test data loaded from a text file, served in batches.
+  std::vector<std::vector<std::vector<float>>> link_step_data_all;
+  std::vector<size_t> lod;  // offsets 0, 11, 22, ... filled by NextBatch
+  std::vector<std::vector<float>> rnn_link_data;  // rows of the current batch
+  std::vector<float> result_data;  // values from the even (result) lines
+  size_t batch_iter{0};  // start index of the next batch
+  size_t batch_size{1};  // samples per batch
+  DataRecord() = default;
+  explicit DataRecord(const std::string &path, int batch_size = 1)
+      : batch_size(batch_size) {
+    Load(path);
+  }
+  DataRecord NextBatch() {  // returns an empty record when data runs out
+    DataRecord data;
+    size_t batch_end = batch_iter + batch_size;
+    // NOTE: skip the final batch if not enough data is provided.
+    if (batch_end <= link_step_data_all.size()) {
+      data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
+                                     link_step_data_all.begin() + batch_end);
+      // Prepare LoDs (cumulative offsets) while flattening the batch.
+      data.lod.push_back(0);
+      CHECK(!data.link_step_data_all.empty()) << "empty";
+      for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
+        for (const auto &d : data.link_step_data_all[j]) {
+          data.rnn_link_data.push_back(d);
+          // each pushed row advances the offset by 11
+          data.lod.push_back(data.lod.back() + 11);
+        }
+      }
+    }
+    batch_iter += batch_size;
+    return data;
+  }
+  void Load(const std::string &path) {  // parses the text after ':' per line
+    std::ifstream file(path);  // NOTE(review): open failure is not checked
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      num_lines++;
+      std::vector<std::string> data;
+      split(line, ':', &data);
+      if (num_lines % 2) {  // odd lines: features
+        std::vector<std::string> feature_data;
+        split(data[1], ' ', &feature_data);  // assumes line contains ':'
+        std::vector<std::vector<float>> link_step_data;
+        int feature_count = 1;
+        std::vector<float> feature;
+        for (auto &step_data : feature_data) {
+          std::vector<float> tmp;
+          split_to_float(step_data, ',', &tmp);
+          feature.insert(feature.end(), tmp.begin(), tmp.end());
+          if (feature_count % 11 == 0) {  // each sample has 11 features
+            link_step_data.push_back(feature);
+            feature.clear();
+          }
+          feature_count++;
+        }
+        link_step_data_all.push_back(std::move(link_step_data));
+      } else {  // even lines: results
+        std::vector<float> tmp;
+        split_to_float(data[1], ',', &tmp);
+        result_data.insert(result_data.end(), tmp.begin(), tmp.end());
+      }
+    }
+  }
+};
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                   int batch_size) {  // NOTE(review): batch_size is unused
+  PaddleTensor feed_tensor;  // the single input tensor, named "feed"
+  feed_tensor.name = "feed";
+  auto one_batch = data->NextBatch();  // consumes one batch from data
+  int token_size = one_batch.rnn_link_data.size();
+  // Each token has 11 features; each feature's dim is 54.
+  std::vector<int> rnn_link_data_shape({token_size * 11, 54});
+  feed_tensor.shape = rnn_link_data_shape;
+  feed_tensor.lod.assign({one_batch.lod});
+  feed_tensor.dtype = PaddleDType::FLOAT32;
+  TensorAssignData<float>(&feed_tensor, one_batch.rnn_link_data);
+  // Set inputs: the slot list becomes just this one tensor.
+  input_slots->assign({feed_tensor});
+}
+
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<float> &base_result) {  // expected values
+  PADDLE_ENFORCE_GT(outputs.size(), 0);  // at least one output is required
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];  // each output is checked against base_result[0..]
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    PADDLE_ENFORCE_GT(size, 0);  // size is the product of the shape dims
+    float *data = static_cast<float *>(out.data.data());  // read as floats
+    for (size_t i = 0; i < size; i++) {  // NOTE(review): shadows outer i
+      EXPECT_NEAR(data[i], base_result[i], 1e-3);
+    }
+  }
+}
+// Test with a really complicated model: native vs. analysis predictor.
+void TestRNN2Prediction() {
+  AnalysisConfig config;
+  config.prog_file = FLAGS_infer_model + "/__model__";
+  config.param_file = FLAGS_infer_model + "/param";
+  config.use_gpu = false;
+  config.device = 0;
+  config.specify_input_name = true;
+  config.enable_ir_optim = true;
+  PADDLE_ENFORCE(config.ir_mode ==
+                 AnalysisConfig::IrPassMode::kExclude);  // default
+
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+
+  auto base_predictor =  // native engine, built from the same config
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  auto predictor =  // analysis engine
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+  std::vector<PaddleTensor> input_slots;
+  DataRecord data(FLAGS_infer_data, batch_size);
+  PrepareInputs(&input_slots, &data, batch_size);  // first batch only
+  std::vector<PaddleTensor> outputs, base_outputs;
+
+  Timer timer1;  // times the native predictor
+  timer1.tic();
+  for (int i = 0; i < num_times; i++) {
+    base_predictor->Run(input_slots, &base_outputs);
+  }
+  PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times);
+
+  Timer timer2;  // times the analysis predictor
+  timer2.tic();
+  for (int i = 0; i < num_times; i++) {
+    predictor->Run(input_slots, &outputs);
+  }
+  PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times);
+
+  CompareResult(base_outputs, data.result_data);  // native vs. file results
+  CompareResult(outputs, data.result_data);  // analysis vs. file results
+}
+
+TEST(Analyzer, rnn2) { TestRNN2Prediction(); }  // gtest entry point
+
+}  // namespace inference
+}  // namespace paddle

From 20b40cb06a0e10748328d6925cdbc8759c04249f Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Thu, 13 Sep 2018 18:27:59 +0800
Subject: [PATCH 75/85] add multi-thread for nlp unit-tests

---
 paddle/fluid/inference/api/helper.h           |   8 +-
 .../fluid/inference/tests/api/CMakeLists.txt  |   6 +-
 .../tests/api/analyzer_lac_tester.cc          |  70 +++-------
 .../tests/api/analyzer_ner_tester.cc          |  73 +++-------
 .../tests/api/analyzer_rnn1_tester.cc         |  84 +-----------
 .../analyzer_text_classification_tester.cc    |  75 ++++-------
 .../fluid/inference/tests/api/tester_helper.h | 126 ++++++++++++++++++
 7 files changed, 207 insertions(+), 235 deletions(-)
 create mode 100644 paddle/fluid/inference/tests/api/tester_helper.h

diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index f6893be428..8e359a6773 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
 }
 
 void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-               double latency) {
+               double latency, int epoch = 1) {
   LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
             << ", threads: " << num_threads << ", thread id: " << tid
             << ", latency: " << latency << "ms ======";
+  if (epoch > 1) {
+    int samples = batch_size * epoch;
+    LOG(INFO) << "====== sample number: " << samples
+              << ", average latency of each sample: " << latency / samples
+              << "ms ======";
+  }
 }
 
 }  // namespace inference
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index d44a2cfa7f..ece25db019 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -49,9 +49,7 @@ set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/te
 set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification")
 download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL})
-inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
+inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc
     EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
     ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
-         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
-         --topn=1 # Just run top 1 batch.
-         )
+         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt)
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index 7e00cb20ad..45c19af520 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -12,21 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_string(infer_model, "", "model path for LAC");
-DEFINE_string(infer_data, "", "data file for LAC");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(burning, 0, "Burning before repeat.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
@@ -126,46 +112,37 @@ void TestLACPrediction(const std::string &model_path,
                        const std::string &data_file, const int batch_size,
                        const int repeat, bool test_all_data,
                        bool use_analysis = false) {
-  NativeConfig config;
-  config.model_dir = model_path;
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
+  AnalysisConfig cfg;
+  cfg.model_dir = model_path;
+  cfg.use_gpu = false;
+  cfg.device = 0;
+  cfg.specify_input_name = true;
+  cfg.enable_ir_optim = true;
+
   std::vector<PaddleTensor> input_slots, outputs_slots;
   DataRecord data(data_file, batch_size);
   GetOneBatch(&input_slots, &data, batch_size);
   std::unique_ptr<PaddlePredictor> predictor;
   if (use_analysis) {
-    AnalysisConfig cfg;
-    cfg.model_dir = model_path;
-    cfg.use_gpu = false;
-    cfg.device = 0;
-    cfg.specify_input_name = true;
-    cfg.enable_ir_optim = true;
     predictor =
         CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
   } else {
     predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
   }
   for (int i = 0; i < FLAGS_burning; i++) {
     predictor->Run(input_slots, &outputs_slots);
   }
   Timer timer;
-  if (test_all_data) {
-    double sum = 0;
-    LOG(INFO) << "Total number of samples: " << data.datasets.size();
-    for (int i = 0; i < repeat; i++) {
-      for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
-        GetOneBatch(&input_slots, &data, batch_size);
-        timer.tic();
-        predictor->Run(input_slots, &outputs_slots);
-        sum += timer.toc();
-      }
+  if (FLAGS_test_all_data) {
+    LOG(INFO) << "test all data";
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
+      GetOneBatch(&input_slots, &data, batch_size);
+      input_slots_all.emplace_back(input_slots);
     }
-    PrintTime(batch_size, repeat, 1, 0, sum / repeat);
-    LOG(INFO) << "Average latency of each sample: "
-              << sum / repeat / data.datasets.size() << " ms";
+    LOG(INFO) << "total number of samples: " << data.datasets.size();
+    TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
     return;
   }
   timer.tic();
@@ -190,19 +167,10 @@ void TestLACPrediction(const std::string &model_path,
   if (use_analysis) {
     // run once for comparion as reference
     auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
     std::vector<PaddleTensor> ref_outputs_slots;
     ref_predictor->Run(input_slots, &ref_outputs_slots);
-    EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
-    auto &ref_out = ref_outputs_slots[0];
-    size_t ref_size =
-        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
-                        [](int a, int b) { return a * b; });
-    EXPECT_EQ(size, ref_size);
-    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
-    for (size_t i = 0; i < size; ++i) {
-      EXPECT_EQ(pdata_ref[i], pdata[i]);
-    }
+    CompareResult(ref_outputs_slots, outputs_slots);
 
     AnalysisPredictor *analysis_predictor =
         dynamic_cast<AnalysisPredictor *>(predictor.get());
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index 6e8e43add7..f8c651e32f 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -12,20 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 10, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
@@ -113,50 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                        48, 39, 38, 16, 25};
 
 void TestChineseNERPrediction(bool use_analysis) {
-  NativeConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
+  AnalysisConfig cfg;
+  cfg.prog_file = FLAGS_infer_model + "/__model__";
+  cfg.param_file = FLAGS_infer_model + "/param";
+  cfg.use_gpu = false;
+  cfg.device = 0;
+  cfg.specify_input_name = true;
+  cfg.enable_ir_optim = true;
 
   std::vector<PaddleTensor> input_slots, outputs;
   std::unique_ptr<PaddlePredictor> predictor;
   Timer timer;
   if (use_analysis) {
-    AnalysisConfig cfg;
-    cfg.prog_file = FLAGS_infer_model + "/__model__";
-    cfg.param_file = FLAGS_infer_model + "/param";
-    cfg.use_gpu = false;
-    cfg.device = 0;
-    cfg.specify_input_name = true;
-    cfg.enable_ir_optim = true;
     predictor =
         CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
   } else {
     predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
   }
 
   if (FLAGS_test_all_data) {
     LOG(INFO) << "test all data";
-    double sum = 0;
-    size_t num_samples;
-    for (int i = 0; i < FLAGS_repeat; i++) {
-      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-      // Just one batch, the num_samples remains the same.
-      num_samples = data.num_samples;
-      for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) {
-        PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-        timer.tic();
-        predictor->Run(input_slots, &outputs);
-        sum += timer.toc();
-      }
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
+      PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+      input_slots_all.emplace_back(input_slots);
     }
-    LOG(INFO) << "total number of samples: " << num_samples;
-    PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
-    LOG(INFO) << "average latency of each sample: "
-              << sum / FLAGS_repeat / num_samples;
+    LOG(INFO) << "total number of samples: " << data.num_samples;
+    TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
     return;
   }
   // Prepare inputs.
@@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) {
   if (use_analysis) {
     // run once for comparion as reference
     auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
     std::vector<PaddleTensor> ref_outputs_slots;
     ref_predictor->Run(input_slots, &ref_outputs_slots);
-    EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
-    auto &ref_out = ref_outputs_slots[0];
-    size_t ref_size =
-        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
-                        [](int a, int b) { return a * b; });
-    EXPECT_EQ(size, ref_size);
-    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
-    for (size_t i = 0; i < size; ++i) {
-      EXPECT_EQ(pdata_ref[i], result[i]);
-    }
+    CompareResult(ref_outputs_slots, outputs);
 
     AnalysisPredictor *analysis_predictor =
         dynamic_cast<AnalysisPredictor *>(predictor.get());
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index b8ac468b4e..df96be544e 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -12,24 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/analyzer.h"
-
-#include <google/protobuf/text_format.h>
-#include <gtest/gtest.h>
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 10, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
@@ -164,26 +147,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   }
 }
 
-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<PaddleTensor> &base_outputs) {
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
-    }
-  }
-}
 // Test with a really complicate model.
 void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
   AnalysisConfig config;
@@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
   config.ir_passes.clear();  // Do not exclude any pass.
 
   int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
 
   auto base_predictor =
       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
@@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
 
   base_predictor->Run(input_slots, &base_outputs);
 
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  input_slots_all.emplace_back(input_slots);
   if (num_threads == 1) {
-    // Prepare inputs.
-    Timer timer;
-    timer.tic();
-    for (int i = 0; i < num_times; i++) {
-      predictor->Run(input_slots, &outputs);
-    }
-    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
+    TestOneThreadPrediction(config, input_slots_all, &outputs);
     CompareResult(outputs, base_outputs);
   } else {
-    std::vector<std::thread> threads;
-    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
-    // because AttentionLSTM's hard code nodeid will be damanged.
-    for (int tid = 0; tid < num_threads; ++tid) {
-      predictors.emplace_back(
-          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-              config));
-    }
-    for (int tid = 0; tid < num_threads; ++tid) {
-      threads.emplace_back([&, tid]() {
-        // Each thread should have local input_slots and outputs.
-        std::vector<PaddleTensor> input_slots;
-        DataRecord data(FLAGS_infer_data, batch_size);
-        PrepareInputs(&input_slots, &data, batch_size);
-        std::vector<PaddleTensor> outputs;
-        Timer timer;
-        timer.tic();
-        for (int i = 0; i < num_times; i++) {
-          predictors[tid]->Run(input_slots, &outputs);
-        }
-        PrintTime(batch_size, num_times, num_threads, tid,
-                  timer.toc() / num_times);
-        CompareResult(outputs, base_outputs);
-      });
-    }
-    for (int i = 0; i < num_threads; ++i) {
-      threads[i].join();
-    }
+    // only return the output of first thread
+    TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
   }
 
   if (use_analysis && activate_ir) {
@@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) {
     // Directly infer with the original model.
     TestRNN1Prediction(false, false, i);
     // Inference with the original model with the analysis turned on, the
-    // analysis
-    // module will transform the program to a data flow graph.
+    // analysis module will transform the program to a data flow graph.
     TestRNN1Prediction(true, false, i);
     // Inference with analysis and IR. The IR module will fuse some large
     // kernels.
diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index 65169f8cfc..1472c475e4 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -12,23 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gflags/gflags.h>
-#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
-#include <gtest/gtest.h>
-#include <fstream>
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/inference/api/timer.h"
-
-DEFINE_string(infer_model, "", "Directory of the inference model.");
-DEFINE_string(infer_data, "", "Path of the dataset.");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(repeat, 1, "How many times to repeat run.");
-DEFINE_int32(topn, -1, "Run top n batches of data to save time");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
@@ -37,24 +21,25 @@ struct DataReader {
   explicit DataReader(const std::string &path)
       : file(new std::ifstream(path)) {}
 
-  bool NextBatch(PaddleTensor *tensor, int batch_size) {
+  bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
     PADDLE_ENFORCE_EQ(batch_size, 1);
     std::string line;
-    tensor->lod.clear();
-    tensor->lod.emplace_back(std::vector<size_t>({0}));
+    PaddleTensor tensor;
+    tensor.dtype = PaddleDType::INT64;
+    tensor.lod.emplace_back(std::vector<size_t>({0}));
     std::vector<int64_t> data;
 
     for (int i = 0; i < batch_size; i++) {
       if (!std::getline(*file, line)) return false;
       inference::split_to_int64(line, ' ', &data);
     }
-    tensor->lod.front().push_back(data.size());
+    tensor.lod.front().push_back(data.size());
 
-    tensor->data.Resize(data.size() * sizeof(int64_t));
-    memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
-    tensor->shape.clear();
-    tensor->shape.push_back(data.size());
-    tensor->shape.push_back(1);
+    tensor.data.Resize(data.size() * sizeof(int64_t));
+    memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t));
+    tensor.shape.push_back(data.size());
+    tensor.shape.push_back(1);
+    input->assign({tensor});
     return true;
   }
 
@@ -68,32 +53,28 @@ void Main(int batch_size) {
   config.model_dir = FLAGS_infer_model;
   config.use_gpu = false;
   config.enable_ir_optim = true;
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
-
-  std::vector<PaddleTensor> input_slots(1);
-  // one batch starts
-  // data --
-  auto &input = input_slots[0];
-  input.dtype = PaddleDType::INT64;
 
-  inference::Timer timer;
-  double sum = 0;
-  std::vector<PaddleTensor> output_slots;
+  std::vector<PaddleTensor> input_slots, output_slots;
+  DataReader reader(FLAGS_infer_data);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
 
-  int num_batches = 0;
-  for (int t = 0; t < FLAGS_repeat; t++) {
-    DataReader reader(FLAGS_infer_data);
-    while (reader.NextBatch(&input, FLAGS_batch_size)) {
-      if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break;
-      timer.tic();
-      CHECK(predictor->Run(input_slots, &output_slots));
-      sum += timer.toc();
+  if (FLAGS_test_all_data) {
+    LOG(INFO) << "test all data";
+    int num_batches = 0;
+    while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
+      input_slots_all.emplace_back(input_slots);
       ++num_batches;
     }
+    LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
+    TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
+    return;
   }
-  PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
+
+  // one batch starts
+  // data --
+  reader.NextBatch(&input_slots, FLAGS_batch_size);
+  input_slots_all.emplace_back(input_slots);
+  TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
 
   // Get output
   LOG(INFO) << "get outputs " << output_slots.size();
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
new file mode 100644
index 0000000000..44688ad36e
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -0,0 +1,126 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <gtest/gtest.h>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data file");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(burning, 0, "Burning before repeat.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+
+namespace paddle {
+namespace inference {
+
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<PaddleTensor> &base_outputs) {
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &base_out = base_outputs[i];
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
+                                   1, [](int a, int b) { return a * b; });
+    PADDLE_ENFORCE_EQ(size, size1);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *data = static_cast<float *>(out.data.data());
+    float *base_data = static_cast<float *>(base_out.data.data());
+    for (size_t i = 0; i < size; i++) {
+      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+    }
+  }
+}
+
+void TestOneThreadPrediction(
+    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
+    std::vector<PaddleTensor> *outputs) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+  Timer timer;
+  timer.tic();
+  for (int i = 0; i < num_times; i++) {
+    for (size_t j = 0; j < inputs.size(); j++) {
+      predictor->Run(inputs[j], outputs);
+    }
+  }
+  PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
+            inputs.size());
+}
+
+void TestMultiThreadPrediction(
+    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
+    std::vector<PaddleTensor> *outputs, int num_threads) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  std::vector<std::thread> threads;
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
+  // because AttentionLSTM's hard-coded node id will be damaged.
+  for (int tid = 0; tid < num_threads; ++tid) {
+    predictors.emplace_back(
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+            config));
+  }
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // Each thread should have local inputs and outputs.
+      // The inputs of each thread are all the same.
+      std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
+      std::vector<PaddleTensor> outputs_tid;
+      Timer timer;
+      timer.tic();
+      for (int i = 0; i < num_times; i++) {
+        for (size_t j = 0; j < inputs_tid.size(); j++) {
+          predictors[tid]->Run(inputs_tid[j], &outputs_tid);
+        }
+      }
+      PrintTime(batch_size, num_times, num_threads, tid,
+                timer.toc() / num_times, inputs_tid.size());
+    });
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i].join();
+  }
+}
+
+void TestPrediction(AnalysisConfig config,
+                    const std::vector<std::vector<PaddleTensor>> inputs,
+                    std::vector<PaddleTensor> *outputs, int num_threads) {
+  if (num_threads == 1) {
+    TestOneThreadPrediction(config, inputs, outputs);
+  } else {
+    TestMultiThreadPrediction(config, inputs, outputs, num_threads);
+  }
+}
+
+}  // namespace inference
+}  // namespace paddle

From 1052a793bc4eb9cdebd4a16772b4b230f808f2c4 Mon Sep 17 00:00:00 2001
From: chuanqiw <chuanqi.wang@intel.com>
Date: Thu, 13 Sep 2018 14:08:13 +0800
Subject: [PATCH 76/85] support group convolution layer with mkldnn.

---
 paddle/fluid/operators/conv_mkldnn_op.cc | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 3eb02c6b61..5385bcdaec 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -302,8 +302,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     bool fuse_relu = ctx.Attr<bool>("fuse_relu");
     int groups = ctx.Attr<int>("groups");
 
-    // TODO(pzelazko-intel) add support for group convolution and dilation
-    PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
+    // TODO(pzelazko-intel): add support for dilation
     PADDLE_ENFORCE(
         dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
         "dilation in convolution is not implemented yet");
@@ -314,6 +313,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> weights_tz =
         paddle::framework::vectorize2int(filter->dims());
+    int g = std::max(groups, 1);
+    if (g > 1) {
+      int o = weights_tz[0];
+      int i = weights_tz[1];
+      int h = weights_tz[2];
+      int w = weights_tz[3];
+      weights_tz.resize(5);
+      weights_tz[0] = g;
+      weights_tz[1] = o / g;
+      weights_tz[2] = i;
+      weights_tz[3] = h;
+      weights_tz[4] = w;
+    }
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
     // Get unique name for storing MKLDNN primitives
@@ -327,7 +339,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto user_src_md = platform::MKLDNNMemDesc(
         {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
     auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
+        {weights_tz}, platform::MKLDNNGetDataType<T>(),
+        (g == 1) ? filter->format() : mkldnn::memory::format::goihw);
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
@@ -340,7 +353,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(),
+        (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
     std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
                                // Currently used whenever bias is != nullptr.
     auto dst_md = platform::MKLDNNMemDesc(

From 9ee1b7bc045091522c53cc69d174bebda979667e Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Thu, 13 Sep 2018 21:17:30 +0800
Subject: [PATCH 77/85] add some comments

---
 paddle/fluid/framework/details/multi_devices_graph_pass.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 5781936cb3..7e7f1234c2 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -431,7 +431,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
                     CreateReduceOp(&result, g_name, cur_device_id);
                     graph->Get<ShardedVarDevice>(kShardedVarDevice)
                         .emplace(g_name, cur_device_id);
-                    bcast_var_name_set[cur_device_id].emplace(p_name);
+                    if (!is_dist_train) {
+                      bcast_var_name_set[cur_device_id].emplace(p_name);
+                    }
                     break;
                   case BuildStrategy::ReduceStrategy::kAllReduce:
                     if (IsSparseGradient(g_name)) {
@@ -461,7 +463,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   if ((use_gpu &&
        strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
       is_dist_train) {
-    // Insert BCast Ops
+    // always broadcast received parameters for distributed training
     for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
       auto &to_bcast_set = bcast_var_name_set[dev_id];
       for (auto &bcast_name : to_bcast_set) {

From 3ab3a7f39241db6ed5698e306d80f8760c63d26f Mon Sep 17 00:00:00 2001
From: Wu Yi <typhoonzero1986@gmail.com>
Date: Thu, 13 Sep 2018 22:15:27 +0800
Subject: [PATCH 78/85] Trainer auto wait pserver ports (#13341)

* trainer auto wait pserver port ready

* add file

* fix docstring

* add option to not wait

* update api spec

* clean

* fix test hang
---
 paddle/fluid/API.spec                         |  4 +-
 .../tests/unittests/test_dist_transpiler.py   |  2 +-
 .../fluid/transpiler/details/__init__.py      |  1 +
 .../fluid/transpiler/details/checkport.py     | 50 +++++++++++++++++++
 .../fluid/transpiler/distribute_transpiler.py |  5 +-
 5 files changed, 58 insertions(+), 4 deletions(-)
 create mode 100644 python/paddle/fluid/transpiler/details/checkport.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 842fde1ec5..1971774527 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara
 paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
 paddle.fluid.InferenceTranspiler.__init__ 
 paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
@@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
 paddle.fluid.transpiler.InferenceTranspiler.__init__ 
 paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index b85501ef6b..a198b25520 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -62,7 +62,7 @@ class TranspilerTest(unittest.TestCase):
 
         t = self._transpiler_instance(config)
 
-        trainer_main = t.get_trainer_program()
+        trainer_main = t.get_trainer_program(wait_port=False)
         trainer_startup = fluid.default_startup_program()
 
         assert (src.num_blocks == 1)
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
index 5e98266a76..f33c05ed2f 100644
--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -16,3 +16,4 @@ from __future__ import print_function
 
 from .program_utils import *
 from .ufind import *
+from .checkport import *
diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py
new file mode 100644
index 0000000000..7bad4b427a
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/checkport.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import time
+import socket
+from contextlib import closing
+
+
+def wait_server_ready(endpoints):
+    """
+    Wait until parameter servers are ready, use connect_ex to detect
+    port readiness.
+
+    Args:
+        endpoints (list): endpoints string list, like:
+                         ["127.0.0.1:8080", "127.0.0.1:8081"]
+
+    Examples:
+        .. code-block:: python
+
+           wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
+    """
+    while True:
+        all_ok = True
+        for ep in endpoints:
+            ip_port = ep.split(":")
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as sock:
+                sock.settimeout(2)
+                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
+                if result != 0:
+                    all_ok = False
+        if not all_ok:
+            sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
+            sys.stderr.flush()
+            time.sleep(3)
+        else:
+            break
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index d4d218d547..53c9cbe23d 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -381,7 +381,7 @@ class DistributeTranspiler(object):
                                                         pserver_endpoints)
             self._split_table_grad_and_add_send_vars(program, pserver_endpoints)
 
-    def get_trainer_program(self):
+    def get_trainer_program(self, wait_port=True):
         """
         Get transpiled trainer side program.
 
@@ -393,6 +393,9 @@ class DistributeTranspiler(object):
         delete_ops(self.origin_program.global_block(), self.optimize_ops)
         self.origin_program.__str__()
 
+        if wait_port:
+            wait_server_ready(self.pserver_endpoints)
+
         return self.origin_program
 
     def _get_trainer_startup_program(self, recv_vars, eplist):

From 757f9683abf603e4b3860934d63fce774b65ec37 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Fri, 14 Sep 2018 00:46:15 +0800
Subject: [PATCH 79/85] update comment text

---
 paddle/fluid/framework/details/multi_devices_graph_pass.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 7e7f1234c2..250e093a5f 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -460,10 +460,14 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
 
+  // Insert broadcast operators principle:
+  // 1. Broadcast optimized parameters in Reduce strategy;
+  // 2. No need broadcast optimized parameters in AllReduce strategy because of
+  //    the optimization sub-graph would be run on every GPU;
+  // 3. Always broadcast received parameters in Distribute Training.
   if ((use_gpu &&
        strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
       is_dist_train) {
-    // allways broadcast receieved parameters for distributed training
     for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
       auto &to_bcast_set = bcast_var_name_set[dev_id];
       for (auto &bcast_name : to_bcast_set) {

From 0c7f883d4faeb6597156f257ec373683b3ce6d66 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Fri, 14 Sep 2018 07:31:25 +0800
Subject: [PATCH 80/85] small fix (#13322)

---
 paddle/fluid/string/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt
index 719411bf66..8572dc1e8e 100644
--- a/paddle/fluid/string/CMakeLists.txt
+++ b/paddle/fluid/string/CMakeLists.txt
@@ -1,6 +1,5 @@
 cc_library(stringpiece SRCS piece.cc)
 cc_library(pretty_log SRCS pretty_log.cc)
-cc_test(test_pretty_log SRCS pretty_log.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)

From b7a64e8698f61ddd82f6a8718e722d3309fd5aa7 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 14 Sep 2018 10:59:48 +0800
Subject: [PATCH 81/85] fix conflicts

---
 paddle/fluid/inference/tests/api/CMakeLists.txt         | 7 ++++++-
 paddle/fluid/inference/tests/api/analyzer_lac_tester.cc | 7 +++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index f1075ea708..3eba375514 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -60,7 +60,12 @@ inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_cla
 set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz")
 set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE)
-    inference_download_and_uncompress(${OCR_INSTALL_DIR} ${OCR_MODEL_URL})
+    get_filename_component(filename ${OCR_MODEL_URL} NAME)
+    message(STATUS "Download inference test stuff ${filename} from ${OCR_MODEL_URL}")
+    execute_process(COMMAND bash -c "mkdir -p ${OCR_INSTALL_DIR}")
+    execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && wget -q ${OCR_MODEL_URL}")
+    execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && tar xzf ${filename}")
+    message(STATUS "finish downloading ${filename}")
 endif()
 inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc
     EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index 45c19af520..bf893e3256 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -110,8 +110,7 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
 
 void TestLACPrediction(const std::string &model_path,
                        const std::string &data_file, const int batch_size,
-                       const int repeat, bool test_all_data,
-                       bool use_analysis = false) {
+                       const int repeat, bool use_analysis = false) {
   AnalysisConfig cfg;
   cfg.model_dir = model_path;
   cfg.use_gpu = false;
@@ -199,13 +198,13 @@ void TestLACPrediction(const std::string &model_path,
 TEST(Analyzer_LAC, native) {
   LOG(INFO) << "LAC with native";
   TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
-                    FLAGS_repeat, FLAGS_test_all_data);
+                    FLAGS_repeat);
 }
 
 TEST(Analyzer_LAC, analysis) {
   LOG(INFO) << "LAC with analysis";
   TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
-                    FLAGS_repeat, FLAGS_test_all_data, true);
+                    FLAGS_repeat, true);
 }
 
 }  // namespace analysis

From 1a99302c141c8de2cd1202b16205a2ec02fb1b67 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 14 Sep 2018 11:24:03 +0800
Subject: [PATCH 82/85] refine and reuse code

---
 .../fluid/inference/tests/api/CMakeLists.txt  |  2 +-
 .../tests/api/analyzer_vis_tester.cc          | 86 +++++--------------
 .../fluid/inference/tests/api/tester_helper.h | 39 ++++++---
 3 files changed, 48 insertions(+), 79 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 3eba375514..e8c34047ab 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -68,6 +68,6 @@ if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE)
     message(STATUS "finish downloading ${filename}")
 endif()
 inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
     ARGS --infer_model=${OCR_INSTALL_DIR}/model
         --infer_data=${OCR_INSTALL_DIR}/data.txt)
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 3675c5f7f3..0591869996 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -12,22 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gflags/gflags.h>
-#include <glog/logging.h>
-#include <gtest/gtest.h>
 #include <fstream>
 #include <iostream>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-
-DEFINE_string(infer_model, "", "model path for LAC");
-DEFINE_string(infer_data, "", "data file for LAC");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
@@ -105,69 +92,36 @@ void TestVisualPrediction(bool use_mkldnn) {
   VLOG(3) << "output.size " << outputs_slots.size();
 
   // run native as reference
-  NativeConfig config;
-  config.param_file = FLAGS_infer_model + "/__params__";
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.use_gpu = false;
-  config.device = 0;
-  // config.specify_input_name = true;
   auto ref_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
   std::vector<PaddleTensor> ref_outputs_slots;
   ref_predictor->Run({input}, &ref_outputs_slots);
-  EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
-  for (size_t i = 0; i < outputs_slots.size(); ++i) {
-    auto &ref_out = ref_outputs_slots[i];
-    auto &out = outputs_slots[i];
-    size_t ref_size =
-        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
-                        [](int a, int b) { return a * b; });
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    EXPECT_EQ(size, ref_size);
-    EXPECT_EQ(out.dtype, ref_out.dtype);
-    switch (out.dtype) {
-      case PaddleDType::INT64: {
-        int64_t *pdata = static_cast<int64_t *>(out.data.data());
-        int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
-        for (size_t j = 0; j < size; ++j) {
-          EXPECT_EQ(pdata_ref[j], pdata[j]);
-        }
-        break;
-      }
-      case PaddleDType::FLOAT32: {
-        float *pdata = static_cast<float *>(out.data.data());
-        float *pdata_ref = static_cast<float *>(ref_out.data.data());
-        for (size_t j = 0; j < size; ++j) {
-          EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3);
-        }
-        break;
-      }
-    }
-    // print what are fused
-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
+  CompareResult(outputs_slots, ref_outputs_slots);
+  // print what are fused
+  AnalysisPredictor *analysis_predictor =
+      dynamic_cast<AnalysisPredictor *>(predictor.get());
+  auto &fuse_statis = analysis_predictor->analysis_argument()
+                          .Get<std::unordered_map<std::string, int>>(
+                              framework::ir::kFuseStatisAttr);
+  for (auto &item : fuse_statis) {
+    LOG(INFO) << "fused " << item.first << " " << item.second;
+  }
+  int num_ops = 0;
+  for (auto &node :
+       analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+    if (node->IsFunction()) {
+      ++num_ops;
     }
-    LOG(INFO) << "has num ops: " << num_ops;
   }
+  LOG(INFO) << "has num ops: " << num_ops;
 }
 
 TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); }
+#ifdef PADDLE_WITH_MKLDNN
 TEST(Analyzer_vis, analysis_mkldnn) {
   TestVisualPrediction(/*use_mkldnn*/ true);
 }
+#endif
 
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 44688ad36e..43e97614e3 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -37,22 +37,37 @@ namespace paddle {
 namespace inference {
 
 void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<PaddleTensor> &base_outputs) {
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+                   const std::vector<PaddleTensor> &ref_outputs) {
+  EXPECT_GT(outputs.size(), 0);
+  EXPECT_EQ(outputs.size(), ref_outputs.size());
   for (size_t i = 0; i < outputs.size(); i++) {
     auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
+    auto &ref_out = ref_outputs[i];
     size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
                                   [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+    size_t ref_size =
+        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
+                        [](int a, int b) { return a * b; });
+    EXPECT_GT(size, 0);
+    EXPECT_EQ(size, ref_size);
+    EXPECT_EQ(out.dtype, ref_out.dtype);
+    switch (out.dtype) {
+      case PaddleDType::INT64: {
+        int64_t *pdata = static_cast<int64_t *>(out.data.data());
+        int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_EQ(pdata_ref[j], pdata[j]);
+        }
+        break;
+      }
+      case PaddleDType::FLOAT32: {
+        float *pdata = static_cast<float *>(out.data.data());
+        float *pdata_ref = static_cast<float *>(ref_out.data.data());
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3);
+        }
+        break;
+      }
     }
   }
 }

From 9a9105018d6acf71bec681d8fdcd3fc6559b80ac Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 14 Sep 2018 04:01:04 +0000
Subject: [PATCH 83/85] fix mac compile error in subgraph_splitter

---
 paddle/fluid/inference/analysis/subgraph_splitter.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index c3a2dbf9d1..b879067d2f 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -120,13 +120,20 @@ void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
     outputs.insert(node);
   }
 
-  // update the dst and src node's inlinks and outlinks.
+// update the dst and src node's inlinks and outlinks.
+#ifdef __clang__
+  src_node->inlinks = std::vector<BriefNode *>(inputs.begin(), inputs.end());
+  src_node->outlinks = std::vector<BriefNode *>(outputs.begin(), outputs.end());
+  dst_node->inlinks.clear();
+  dst_node->outlinks.clear();
+#else
   src_node->inlinks =
       std::move(std::vector<BriefNode *>(inputs.begin(), inputs.end()));
   src_node->outlinks =
       std::move(std::vector<BriefNode *>(outputs.begin(), outputs.end()));
   dst_node->inlinks.clear();
   dst_node->outlinks.clear();
+#endif
 
   auto inlink_or_outlink_cleaner = [&](std::vector<BriefNode *> &nodes) {
     for (auto *&n : nodes) {

From 26fc698f8510873594e7abbd9e64d141f1233887 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 14 Sep 2018 13:11:50 +0800
Subject: [PATCH 84/85] disable mkldnn fuse on ocr test

---
 paddle/fluid/inference/tests/api/analyzer_vis_tester.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 0591869996..a207c41b71 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -62,7 +62,12 @@ void TestVisualPrediction(bool use_mkldnn) {
   cfg._use_mkldnn = use_mkldnn;
   cfg.device = 0;
   cfg.enable_ir_optim = true;
+  // TODO(TJ): fix fusion gru
   cfg.ir_passes.push_back("fc_gru_fuse_pass");
+#ifdef PADDLE_WITH_MKLDNN
+  // disable mkldnn fuse since it should have some bugs
+  cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
+#endif
   predictor =
       CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
 

From 9e2e893f5975b282fa7a7d0bc599971ab342bef8 Mon Sep 17 00:00:00 2001
From: Xingyuan Bu <sefira@users.noreply.github.com>
Date: Fri, 14 Sep 2018 14:53:37 +0800
Subject: [PATCH 85/85] Enhance generate_proposal_labels_op and fix some bugs.
 (#13239)

* Enhance generate_proposal_labels_op
* Fix bug in generate_proposals_op
* Refine rpn_target_assign_op.
* by Bu Xingyuan, Wang Guanzhong and Dang Qingqing
---
 paddle/fluid/API.spec                         |   4 +-
 paddle/fluid/operators/detection/bbox_util.h  |  33 +-
 .../detection/generate_proposal_labels_op.cc  | 139 ++--
 .../detection/generate_proposals_op.cc        |  46 +-
 .../detection/rpn_target_assign_op.cc         | 602 ++++++++++++------
 python/paddle/fluid/layers/detection.py       |  85 ++-
 python/paddle/fluid/tests/test_detection.py   | 143 +++--
 ...py => test_generate_proposal_labels_op.py} |  93 +--
 ...osals.py => test_generate_proposals_op.py} |  41 +-
 .../unittests/test_rpn_target_assign_op.py    | 214 ++++---
 10 files changed, 874 insertions(+), 526 deletions(-)
 rename python/paddle/fluid/tests/unittests/{test_generate_proposal_labels.py => test_generate_proposal_labels_op.py} (77%)
 rename python/paddle/fluid/tests/unittests/{test_generate_proposals.py => test_generate_proposals_op.py} (88%)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 1971774527..e362d34864 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg
 paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
 paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None))
 paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
-paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
+paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
 paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
-paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None))
+paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
 paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
 paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h
index 0dee178162..6abeca1da4 100644
--- a/paddle/fluid/operators/detection/bbox_util.h
+++ b/paddle/fluid/operators/detection/bbox_util.h
@@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <algorithm>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
 
@@ -21,7 +22,7 @@ namespace operators {
  */
 template <typename T>
 inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
-                       const framework::Tensor& gt_boxes, const T* weights,
+                       const framework::Tensor& gt_boxes, const float* weights,
                        const bool normalized, framework::Tensor* box_delta) {
   auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
   auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
@@ -62,5 +63,35 @@ void Gather(const T* in, const int in_stride, const int* index, const int num,
   }
 }
 
+template <typename T>
+void BboxOverlaps(const framework::Tensor& r_boxes,
+                  const framework::Tensor& c_boxes,
+                  framework::Tensor* overlaps) {
+  auto r_boxes_et = framework::EigenTensor<T, 2>::From(r_boxes);
+  auto c_boxes_et = framework::EigenTensor<T, 2>::From(c_boxes);
+  auto overlaps_et = framework::EigenTensor<T, 2>::From(*overlaps);
+  int r_num = r_boxes.dims()[0];
+  int c_num = c_boxes.dims()[0];
+  auto zero = static_cast<T>(0.0);
+  T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h,
+      inter_area;
+  for (int i = 0; i < r_num; ++i) {
+    r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) *
+                 (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1);
+    for (int j = 0; j < c_num; ++j) {
+      c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) *
+                   (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1);
+      x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0));
+      y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1));
+      x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2));
+      y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3));
+      inter_w = std::max(x_max - x_min + 1, zero);
+      inter_h = std::max(y_max - y_min + 1, zero);
+      inter_area = inter_w * inter_h;
+      overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area);
+    }
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index be06dc1974..d7a53f1bef 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -42,10 +42,11 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
                    "Input(RpnRois) shouldn't be null.");
     PADDLE_ENFORCE(ctx->HasInput("GtClasses"),
                    "Input(GtClasses) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
+                   "Input(IsCrowd) shouldn't be null.");
     PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
                    "Input(GtBoxes) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ImScales"),
-                   "Input(ImScales) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
 
     PADDLE_ENFORCE(ctx->HasOutput("Rois"),
                    "Output(Rois) of RpnTargetAssignOp should not be null");
@@ -64,22 +65,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
 
     auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
     auto gt_classes_dims = ctx->GetInputDim("GtClasses");
+    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
     auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
-    auto im_scales_dims = ctx->GetInputDim("ImScales");
+    auto im_info_dims = ctx->GetInputDim("ImInfo");
 
     PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2,
                       "The rank of Input(RpnRois) must be 2.");
-    PADDLE_ENFORCE_EQ(gt_classes_dims.size(), 1,
-                      "The rank of Input(GtClasses) must be 1.");
     PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
                       "The rank of Input(GtBoxes) must be 2.");
-    PADDLE_ENFORCE_EQ(im_scales_dims.size(), 1,
-                      "The rank of Input(ImScales) must be 1.");
+    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
+                      "The rank of Input(ImInfo) must be 2.");
 
     int class_nums = ctx->Attrs().Get<int>("class_nums");
 
     ctx->SetOutputDim("Rois", {-1, 4});
-    ctx->SetOutputDim("LabelsInt32", {-1});
+    ctx->SetOutputDim("LabelsInt32", {-1, 1});
     ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums});
     ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums});
     ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums});
@@ -105,45 +105,18 @@ void Concat(const platform::CPUDeviceContext& context,
   concat_functor(context, inputs, axis, out_tensor);
 }
 
-template <typename T>
-void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
-                  Tensor* overlaps) {
-  auto r_boxes_et = framework::EigenTensor<T, 2>::From(r_boxes);
-  auto c_boxes_et = framework::EigenTensor<T, 2>::From(c_boxes);
-  auto overlaps_et = framework::EigenTensor<T, 2>::From(*overlaps);
-  int r_num = r_boxes.dims()[0];
-  int c_num = c_boxes.dims()[0];
-  auto zero = static_cast<T>(0.0);
-  T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h,
-      inter_area;
-  for (int i = 0; i < r_num; ++i) {
-    r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) *
-                 (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1);
-    for (int j = 0; j < c_num; ++j) {
-      c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) *
-                   (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1);
-      x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0));
-      y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1));
-      x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2));
-      y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3));
-      inter_w = std::max(x_max - x_min + 1, zero);
-      inter_h = std::max(y_max - y_min + 1, zero);
-      inter_area = inter_w * inter_h;
-      overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area);
-    }
-  }
-}
-
 template <typename T>
 std::vector<std::vector<int>> SampleFgBgGt(
     const platform::CPUDeviceContext& context, Tensor* iou,
-    const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
-    const float bg_thresh_hi, const float bg_thresh_lo,
-    std::minstd_rand engine) {
+    const Tensor& is_crowd, const int batch_size_per_im,
+    const float fg_fraction, const float fg_thresh, const float bg_thresh_hi,
+    const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) {
   std::vector<int> fg_inds;
   std::vector<int> bg_inds;
   std::vector<int> gt_inds;
-  T* proposal_to_gt_overlaps = iou->mutable_data<T>(context.GetPlace());
+  int64_t gt_num = is_crowd.numel();
+  const int* crowd_data = is_crowd.data<int>();
+  T* proposal_to_gt_overlaps = iou->data<T>();
   int64_t row = iou->dims()[0];
   int64_t col = iou->dims()[1];
   float epsilon = 0.00001;
@@ -152,6 +125,9 @@ std::vector<std::vector<int>> SampleFgBgGt(
   for (int64_t i = 0; i < row; ++i) {
     const T* v = proposal_to_gt_overlaps + i * col;
     T max_overlap = *std::max_element(v, v + col);
+    if ((i < gt_num) && (crowd_data[i])) {
+      max_overlap = -1.0;
+    }
     if (max_overlap > fg_thresh) {
       for (int64_t j = 0; j < col; ++j) {
         T val = proposal_to_gt_overlaps[i * col + j];
@@ -170,17 +146,19 @@ std::vector<std::vector<int>> SampleFgBgGt(
   }
 
   // Reservoir Sampling
+  std::uniform_real_distribution<float> uniform(0, 1);
   int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction);
   int fg_rois_this_image = fg_inds.size();
   int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image);
-  std::uniform_real_distribution<float> uniform(0, 1);
-  const int64_t fg_size = static_cast<int64_t>(fg_inds.size());
-  if (fg_size > fg_rois_per_this_image) {
-    for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) {
-      int rng_ind = std::floor(uniform(engine) * i);
-      if (rng_ind < fg_rois_per_this_image) {
-        std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i);
-        std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i);
+  if (use_random) {
+    const int64_t fg_size = static_cast<int64_t>(fg_inds.size());
+    if (fg_size > fg_rois_per_this_image) {
+      for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) {
+        int rng_ind = std::floor(uniform(engine) * i);
+        if (rng_ind < fg_rois_per_this_image) {
+          std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i);
+          std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i);
+        }
       }
     }
   }
@@ -192,12 +170,14 @@ std::vector<std::vector<int>> SampleFgBgGt(
   int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image;
   int bg_rois_this_image = bg_inds.size();
   int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image);
-  const int64_t bg_size = static_cast<int64_t>(bg_inds.size());
-  if (bg_size > bg_rois_per_this_image) {
-    for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) {
-      int rng_ind = std::floor(uniform(engine) * i);
-      if (rng_ind < fg_rois_per_this_image)
-        std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i);
+  if (use_random) {
+    const int64_t bg_size = static_cast<int64_t>(bg_inds.size());
+    if (bg_size > bg_rois_per_this_image) {
+      for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) {
+        int rng_ind = std::floor(uniform(engine) * i);
+        if (rng_ind < fg_rois_per_this_image)
+          std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i);
+      }
     }
   }
   std::vector<int> new_bg_inds(bg_inds.begin(),
@@ -248,14 +228,14 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
 template <typename T>
 std::vector<Tensor> SampleRoisForOneImage(
     const platform::CPUDeviceContext& context, Tensor* rpn_rois,
-    Tensor* gt_classes, Tensor* gt_boxes, Tensor* im_scale,
+    Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info,
     const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
     const float bg_thresh_hi, const float bg_thresh_lo,
     const std::vector<float>& bbox_reg_weights, const int class_nums,
-    std::minstd_rand engine) {
+    std::minstd_rand engine, bool use_random) {
   auto rpn_rois_et = framework::EigenTensor<T, 2>::From(*rpn_rois);
-  auto im_scale_data = im_scale->data<T>()[0];
-  rpn_rois_et = rpn_rois_et / im_scale_data;
+  auto im_scale = im_info->data<T>()[2];
+  rpn_rois_et = rpn_rois_et / im_scale;
 
   Tensor boxes;
   int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0];
@@ -270,8 +250,8 @@ std::vector<Tensor> SampleRoisForOneImage(
 
   // Generate proposal index
   std::vector<std::vector<int>> fg_bg_gt = SampleFgBgGt<T>(
-      context, &proposal_to_gt_overlaps, batch_size_per_im, fg_fraction,
-      fg_thresh, bg_thresh_hi, bg_thresh_lo, engine);
+      context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im,
+      fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random);
   std::vector<int> fg_inds = fg_bg_gt[0];
   std::vector<int> bg_inds = fg_bg_gt[1];
   std::vector<int> gt_inds = fg_bg_gt[2];
@@ -291,15 +271,15 @@ std::vector<Tensor> SampleRoisForOneImage(
   // Compute targets
   Tensor bbox_targets_single;
   bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
-  BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, nullptr, false,
-                &bbox_targets_single);
+  BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(),
+                false, &bbox_targets_single);
 
   // Scale rois
   Tensor sampled_rois;
   sampled_rois.mutable_data<T>(sampled_boxes.dims(), context.GetPlace());
   auto sampled_rois_et = framework::EigenTensor<T, 2>::From(sampled_rois);
   auto sampled_boxes_et = framework::EigenTensor<T, 2>::From(sampled_boxes);
-  sampled_rois_et = sampled_boxes_et * im_scale_data;
+  sampled_rois_et = sampled_boxes_et * im_scale;
 
   // Expand box targets
   Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights;
@@ -351,8 +331,9 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* rpn_rois = context.Input<LoDTensor>("RpnRois");
     auto* gt_classes = context.Input<LoDTensor>("GtClasses");
+    auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
     auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
-    auto* im_scales = context.Input<LoDTensor>("ImScales");
+    auto* im_info = context.Input<LoDTensor>("ImInfo");
 
     auto* rois = context.Output<LoDTensor>("Rois");
     auto* labels_int32 = context.Output<LoDTensor>("LabelsInt32");
@@ -369,18 +350,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
     std::vector<float> bbox_reg_weights =
         context.Attr<std::vector<float>>("bbox_reg_weights");
     int class_nums = context.Attr<int>("class_nums");
+    bool use_random = context.Attr<bool>("use_random");
 
     PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL,
                       "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD");
     PADDLE_ENFORCE_EQ(
         gt_classes->lod().size(), 1UL,
         "GenerateProposalLabelsOp gt_classes needs 1 level of LoD");
+    PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
+                      "GenerateProposalLabelsOp is_crowd needs 1 level of LoD");
     PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
                       "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD");
     int64_t n = static_cast<int64_t>(rpn_rois->lod().back().size() - 1);
 
     rois->mutable_data<T>({n * batch_size_per_im, kBoxDim}, context.GetPlace());
-    labels_int32->mutable_data<int>({n * batch_size_per_im},
+    labels_int32->mutable_data<int>({n * batch_size_per_im, 1},
                                     context.GetPlace());
     bbox_targets->mutable_data<T>({n * batch_size_per_im, kBoxDim * class_nums},
                                   context.GetPlace());
@@ -391,8 +375,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
 
     std::random_device rnd;
     std::minstd_rand engine;
-    int seed =
-        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+    int seed = rnd();
     engine.seed(seed);
 
     framework::LoD lod;
@@ -403,19 +386,23 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
 
     auto rpn_rois_lod = rpn_rois->lod().back();
     auto gt_classes_lod = gt_classes->lod().back();
+    auto is_crowd_lod = is_crowd->lod().back();
     auto gt_boxes_lod = gt_boxes->lod().back();
     for (int i = 0; i < n; ++i) {
       Tensor rpn_rois_slice =
           rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
       Tensor gt_classes_slice =
           gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
+      Tensor is_crowd_slice =
+          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
       Tensor gt_boxes_slice =
           gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
-      Tensor im_scales_slice = im_scales->Slice(i, i + 1);
+      Tensor im_info_slice = im_info->Slice(i, i + 1);
       std::vector<Tensor> tensor_output = SampleRoisForOneImage<T>(
-          dev_ctx, &rpn_rois_slice, &gt_classes_slice, &gt_boxes_slice,
-          &im_scales_slice, batch_size_per_im, fg_fraction, fg_thresh,
-          bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine);
+          dev_ctx, &rpn_rois_slice, &gt_classes_slice, &is_crowd_slice,
+          &gt_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction,
+          fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums,
+          engine, use_random);
       Tensor sampled_rois = tensor_output[0];
       Tensor sampled_labels_int32 = tensor_output[1];
       Tensor sampled_bbox_targets = tensor_output[2];
@@ -442,7 +429,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
     bbox_inside_weights->set_lod(lod);
     bbox_outside_weights->set_lod(lod);
     rois->Resize({num_rois, kBoxDim});
-    labels_int32->Resize({num_rois});
+    labels_int32->Resize({num_rois, 1});
     bbox_targets->Resize({num_rois, kBoxDim * class_nums});
     bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums});
     bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums});
@@ -455,8 +442,9 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
     // TODO(buxingyuan): Add Document
     AddInput("RpnRois", "RpnRois.");
     AddInput("GtClasses", "GtClasses.");
+    AddInput("IsCrowd", "IsCrowd.");
     AddInput("GtBoxes", "GtBoxes.");
-    AddInput("ImScales", "ImScales.");
+    AddInput("ImInfo", "ImInfo.");
 
     AddOutput("Rois", "Rois.");
     AddOutput("LabelsInt32", "LabelsInt32.");
@@ -471,8 +459,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<float>("bg_thresh_lo", "bg_thresh_lo");
     AddAttr<std::vector<float>>("bbox_reg_weights", "bbox_reg_weights");
     AddAttr<int>("class_nums", "class_nums");
-    AddAttr<bool>("fix_seed", "fix_seed").SetDefault(false);
-    AddAttr<int>("seed", "seed").SetDefault(0);
+    AddAttr<bool>("use_random", "use_random").SetDefault(true);
 
     AddComment(R"DOC(
 Generate Proposals Labels Operator.
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index ebe6830ecc..c33aa25536 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -89,12 +89,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
   }
 
   for (int64_t i = 0; i < row; ++i) {
-    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len];
-    T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1];
+    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
+    T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
 
-    T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2;
-    T anchor_center_y =
-        (anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2;
+    T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
+    T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
 
     T bbox_center_x = 0, bbox_center_y = 0;
     T bbox_width = 0, bbox_height = 0;
@@ -106,25 +105,31 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
       bbox_center_y = variances_data[i * len + 1] *
                           bbox_deltas_data[i * len + 1] * anchor_height +
                       anchor_center_y;
-      bbox_width = std::exp(variances_data[i * len + 2] *
-                            bbox_deltas_data[i * len + 2]) *
+      bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
+                                            bbox_deltas_data[i * len + 2],
+                                        std::log(1000.0 / 16.0))) *
                    anchor_width;
-      bbox_height = std::exp(variances_data[i * len + 3] *
-                             bbox_deltas_data[i * len + 3]) *
+      bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
+                                             bbox_deltas_data[i * len + 3],
+                                         std::log(1000.0 / 16.0))) *
                     anchor_height;
     } else {
       bbox_center_x =
           bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
       bbox_center_y =
           bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
-      bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
-      bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
+      bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
+                                        std::log(1000.0 / 16.0))) *
+                   anchor_width;
+      bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
+                                         std::log(1000.0 / 16.0))) *
+                    anchor_height;
     }
 
     proposals_data[i * len] = bbox_center_x - bbox_width / 2;
     proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
-    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
-    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
+    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
+    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
   }
   // return proposals;
 }
@@ -156,18 +161,23 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
                  float min_size, const Tensor &im_info, Tensor *keep) {
   const T *im_info_data = im_info.data<T>();
   T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
-  min_size *= im_info_data[2];
+  T im_scale = im_info_data[2];
   keep->Resize({boxes->dims()[0], 1});
+  min_size = std::max(min_size, 1.0f);
   int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
 
   int keep_len = 0;
   for (int i = 0; i < boxes->dims()[0]; ++i) {
     T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
     T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
+    T ws_origin_scale =
+        (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
+    T hs_origin_scale =
+        (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
     T x_ctr = boxes_data[4 * i] + ws / 2;
     T y_ctr = boxes_data[4 * i + 1] + hs / 2;
-    if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] &&
-        y_ctr <= im_info_data[0]) {
+    if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
+        x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
       keep_data[keep_len++] = i;
     }
   }
@@ -218,8 +228,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
     const T inter_ymin = std::max(box1[1], box2[1]);
     const T inter_xmax = std::min(box1[2], box2[2]);
     const T inter_ymax = std::min(box1[3], box2[3]);
-    const T inter_w = inter_xmax - inter_xmin;
-    const T inter_h = inter_ymax - inter_ymin;
+    const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
+    const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
     const T inter_area = inter_w * inter_h;
     const T bbox1_area = BBoxArea<T>(box1, normalized);
     const T bbox2_area = BBoxArea<T>(box2, normalized);
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index 88757f25cd..dda423efd3 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -31,8 +31,14 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
-                   "Input(DistMat) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Anchor"),
+                   "Input(Anchor) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
+                   "Input(GtBoxes) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
+                   "Input(Anchor) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
+                   "Input(ImInfo) of RpnTargetAssignOp should not be null");
 
     PADDLE_ENFORCE(
         ctx->HasOutput("LocationIndex"),
@@ -43,10 +49,20 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(
         ctx->HasOutput("TargetLabel"),
         "Output(TargetLabel) of RpnTargetAssignOp should not be null");
-
-    auto in_dims = ctx->GetInputDim("DistMat");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
-                      "The rank of Input(DistMat) must be 2.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("TargetBBox"),
+        "Output(TargetBBox) of RpnTargetAssignOp should not be null");
+
+    auto anchor_dims = ctx->GetInputDim("Anchor");
+    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
+    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
+    auto im_info_dims = ctx->GetInputDim("ImInfo");
+    PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
+                      "The rank of Input(Anchor) must be 2.");
+    PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
+                      "The rank of Input(GtBoxes) must be 2.");
+    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
+                      "The rank of Input(ImInfo) must be 2.");
 
     ctx->SetOutputDim("LocationIndex", {-1});
     ctx->SetOutputDim("ScoreIndex", {-1});
@@ -59,198 +75,383 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(
-            ctx.Input<framework::LoDTensor>("DistMat")->type()),
+            ctx.Input<framework::LoDTensor>("Anchor")->type()),
         platform::CPUPlace());
   }
 };
 
 template <typename T>
-class RpnTargetAssignKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* anchor_t = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
-    auto* gt_bbox_t = context.Input<Tensor>("GtBox");
-    auto* dist_t = context.Input<LoDTensor>("DistMat");
+void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) {
+  // Raw-copies all of `to_add` into `out`, starting `offset` elements in.
+  T* dst = out->data<T>() + offset;
+  memcpy(dst, to_add->data<T>(), to_add->numel() * sizeof(T));
+}
+
+template <typename T>
+std::vector<Tensor> FilterStraddleAnchor(
+    const platform::CPUDeviceContext& context, const Tensor* anchor,
+    const float rpn_straddle_thresh, T im_height, T im_width) {
+  // Selects the anchors that stay within the image bounds widened by
+  // rpn_straddle_thresh on each side. A negative threshold disables the
+  // check, so every anchor is selected. Each anchor is a row of 4 values:
+  // values 0/1 face the lower bound, value 2 im_width, value 3 im_height.
+  // Returns {int tensor of selected anchor indices, the selected anchors}.
+  const int anchor_num = anchor->dims()[0];
+  const T* anchor_data = anchor->data<T>();
+  const bool check_bounds = rpn_straddle_thresh >= 0;
+  std::vector<int> kept;
+  for (int i = 0; i < anchor_num; ++i) {
+    const T* box = anchor_data + i * 4;
+    const bool inside = !check_bounds ||
+                        ((box[0] >= -rpn_straddle_thresh) &&
+                         (box[1] >= -rpn_straddle_thresh) &&
+                         (box[2] < im_width + rpn_straddle_thresh) &&
+                         (box[3] < im_height + rpn_straddle_thresh));
+    if (inside) kept.emplace_back(i);
+  }
+
+  // Materialize the kept indices as an int tensor.
+  int kept_num = kept.size();
+  Tensor kept_inds_t;
+  int* kept_inds_data =
+      kept_inds_t.mutable_data<int>({kept_num}, context.GetPlace());
+  std::copy(kept.begin(), kept.end(), kept_inds_data);
+
+  // Gather the matching anchor rows.
+  Tensor kept_anchor_t;
+  T* kept_anchor_data =
+      kept_anchor_t.mutable_data<T>({kept_num, 4}, context.GetPlace());
+  Gather<T>(anchor->data<T>(), 4, kept_inds_data, kept_num, kept_anchor_data);
+
+  return {kept_inds_t, kept_anchor_t};
+}
+
+template <typename T>
+Tensor FilterCrowdGt(const platform::CPUDeviceContext& context,
+                     Tensor* gt_boxes, Tensor* is_crowd) {
+  // Returns a new tensor with only the gt boxes whose is_crowd flag is 0,
+  // in their original order. Each box is a row of 4 values.
+  const int gt_num = gt_boxes->dims()[0];
+  const int* crowd_flags = is_crowd->data<int>();
+  std::vector<int> keep;
+  for (int idx = 0; idx < gt_num; ++idx) {
+    if (crowd_flags[idx] != 0) continue;
+    keep.emplace_back(idx);
+  }
+  int keep_num = keep.size();
+  Tensor kept_boxes;
+  T* kept_boxes_data =
+      kept_boxes.mutable_data<T>({keep_num, 4}, context.GetPlace());
+  Gather<T>(gt_boxes->data<T>(), 4, keep.data(), keep_num, kept_boxes_data);
+  return kept_boxes;
+}
+
+void ReservoirSampling(const int num, std::vector<int>* inds,
+                       std::minstd_rand engine, bool use_random) {
+  // Truncates *inds to at most `num` entries. With use_random, later entries
+  // are first swapped into the leading `num` slots at random; without it the
+  // leading `num` entries are kept as they are.
+  const size_t total = inds->size();
+  if (total <= static_cast<size_t>(num)) return;
+  std::uniform_real_distribution<float> uniform(0, 1);
+  for (size_t pos = num; use_random && pos < total; ++pos) {
+    const int slot = std::floor(uniform(engine) * pos);
+    if (slot >= num) continue;
+    std::iter_swap(inds->begin() + slot, inds->begin() + pos);
+  }
+  inds->resize(num);
+}
+
+template <typename T>  // Labels anchors fg (1) / bg (0) and subsamples each set.
+void ScoreAssign(const T* anchor_by_gt_overlap_data,
+                 const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max,
+                 const int rpn_batch_size_per_im, const float rpn_fg_fraction,
+                 const float rpn_positive_overlap,
+                 const float rpn_negative_overlap, std::vector<int>* fg_inds,
+                 std::vector<int>* bg_inds, std::vector<int>* tgt_lbl,
+                 std::minstd_rand engine, bool use_random) {
+  float epsilon = 0.00001;  // tolerance when comparing overlaps for equality
+  int anchor_num = anchor_to_gt_max.dims()[0];
+  int gt_num = gt_to_anchor_max.dims()[0];
+  std::vector<int> target_label(anchor_num, -1);  // -1: anchor not selected
+  std::vector<int> fg_inds_fake;  // fg candidates before sampling
+  std::vector<int> bg_inds_fake;  // bg candidates before sampling
+  const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
+  const T* gt_to_anchor_max_data = gt_to_anchor_max.data<T>();
+  // TODO(buxingyuan): This matches Detectron for now,
+  // but the two-direction assignment looks buggy:
+  // the later assignment may overwrite the earlier one.
+  for (int64_t i = 0; i < anchor_num; ++i) {
+    bool is_anchors_with_max_overlap = false;
+    for (int64_t j = 0; j < gt_num; ++j) {
+      T value = anchor_by_gt_overlap_data[i * gt_num + j];  // row i, column j
+      T diff = std::abs(value - gt_to_anchor_max_data[j]);
+      if (diff < epsilon) {  // anchor i attains gt j's maximum overlap
+        is_anchors_with_max_overlap = true;
+        break;
+      }
+    }
+    bool is_anchor_great_than_thresh =
+        (anchor_to_gt_max_data[i] >= rpn_positive_overlap);
+    if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) {
+      fg_inds_fake.push_back(i);
+    }
+  }
 
-    auto* loc_index_t = context.Output<Tensor>("LocationIndex");
-    auto* score_index_t = context.Output<Tensor>("ScoreIndex");
-    auto* tgt_bbox_t = context.Output<Tensor>("TargetBBox");
-    auto* tgt_lbl_t = context.Output<Tensor>("TargetLabel");
+  // Reservoir Sampling
+  int fg_num = static_cast<int>(rpn_fg_fraction * rpn_batch_size_per_im);  // fg cap
+  ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random);
+  fg_num = static_cast<int>(fg_inds_fake.size());  // fg count after sampling
+  for (int64_t i = 0; i < fg_num; ++i) {
+    target_label[fg_inds_fake[i]] = 1;
+  }
 
-    auto lod = dist_t->lod().back();
-    int64_t batch_num = static_cast<int64_t>(lod.size() - 1);
-    int64_t anchor_num = dist_t->dims()[1];
-    PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]);
+  int bg_num = rpn_batch_size_per_im - fg_num;  // remaining budget goes to bg
+  for (int64_t i = 0; i < anchor_num; ++i) {
+    if (anchor_to_gt_max_data[i] < rpn_negative_overlap) {
+      bg_inds_fake.push_back(i);
+    }
+  }
+  ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random);  // engine is copied per call
+  bg_num = static_cast<int>(bg_inds_fake.size());
+  for (int64_t i = 0; i < bg_num; ++i) {
+    target_label[bg_inds_fake[i]] = 0;  // may replace a 1 written by the fg loop
+  }
 
-    int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
-    float pos_threshold = context.Attr<float>("rpn_positive_overlap");
-    float neg_threshold = context.Attr<float>("rpn_negative_overlap");
-    float fg_fraction = context.Attr<float>("fg_fraction");
+  for (int64_t i = 0; i < anchor_num; ++i) {  // emit final indices in ascending order
+    if (target_label[i] == 1) fg_inds->emplace_back(i);
+    if (target_label[i] == 0) bg_inds->emplace_back(i);
+  }
+  fg_num = fg_inds->size();
+  bg_num = bg_inds->size();
+
+  tgt_lbl->resize(fg_num + bg_num, 0);  // layout: fg_num ones, then bg_num zeros
+  std::vector<int> fg_lbl(fg_num, 1);
+  std::vector<int> bg_lbl(bg_num, 0);
+  std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data());
+  std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num);
+}
+
+template <typename T>  // Samples fg/bg anchors from an anchor-by-gt overlap matrix.
+std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
+                                    const Tensor& anchor_by_gt_overlap,
+                                    const int rpn_batch_size_per_im,
+                                    const float rpn_positive_overlap,
+                                    const float rpn_negative_overlap,
+                                    const float rpn_fg_fraction,
+                                    std::minstd_rand engine, bool use_random) {
+  auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data<T>();
+  int anchor_num = anchor_by_gt_overlap.dims()[0];  // rows: one per anchor
+  int gt_num = anchor_by_gt_overlap.dims()[1];  // columns: one per gt box
+
+  std::vector<int> fg_inds;
+  std::vector<int> bg_inds;
+  std::vector<int> gt_inds;
+  std::vector<int> tgt_lbl;
+
+  // Calculate the max IoU between anchors and gt boxes
+  // Map from anchor to gt box that has highest overlap
+  auto place = ctx.GetPlace();
+  Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max;
+  anchor_to_gt_max.mutable_data<T>({anchor_num}, place);
+  int* argmax = anchor_to_gt_argmax.mutable_data<int>({anchor_num}, place);
+  gt_to_anchor_max.mutable_data<T>({gt_num}, place);
+
+  auto anchor_by_gt_overlap_et =
+      framework::EigenMatrix<T>::From(anchor_by_gt_overlap);
+  auto anchor_to_gt_max_et =
+      framework::EigenVector<T>::Flatten(anchor_to_gt_max);
+  auto gt_to_anchor_max_et =
+      framework::EigenVector<T>::Flatten(gt_to_anchor_max);
+  auto anchor_to_gt_argmax_et =
+      framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
+  anchor_to_gt_max_et =
+      anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(1));  // reduce dim 1
+  anchor_to_gt_argmax_et =
+      anchor_by_gt_overlap_et.argmax(1).template cast<int>();  // argmax on dim 1
+  gt_to_anchor_max_et =
+      anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(0));  // reduce dim 0
+
+  // Follow the Faster RCNN's implementation
+  ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max,
+              rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap,
+              rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine,
+              use_random);
+
+  int fg_num = fg_inds.size();
+  int bg_num = bg_inds.size();
+  gt_inds.reserve(fg_num);
+  for (int i = 0; i < fg_num; ++i) {
+    gt_inds.emplace_back(argmax[fg_inds[i]]);  // argmax entry of each fg anchor
+  }
 
-    int fg_num_per_batch = static_cast<int>(rpn_batch_size * fg_fraction);
+  Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t;
+  int* loc_index_data = loc_index_t.mutable_data<int>({fg_num}, place);
+  int* score_index_data =
+      score_index_t.mutable_data<int>({fg_num + bg_num}, place);
+  int* tgt_lbl_data = tgt_lbl_t.mutable_data<int>({fg_num + bg_num}, place);
+  int* gt_inds_data = gt_inds_t.mutable_data<int>({fg_num}, place);
+  std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data);
+  std::copy(fg_inds.begin(), fg_inds.end(), score_index_data);  // fg first ...
+  std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num);  // ... then bg
+  std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data);
+  std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data);
+  std::vector<Tensor> loc_score_tgtlbl_gt;  // order: loc, score, label, gt index
+  loc_score_tgtlbl_gt.emplace_back(loc_index_t);
+  loc_score_tgtlbl_gt.emplace_back(score_index_t);
+  loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t);
+  loc_score_tgtlbl_gt.emplace_back(gt_inds_t);
+
+  return loc_score_tgtlbl_gt;
+}
 
-    int64_t max_num = batch_num * anchor_num;
+template <typename T>
+class RpnTargetAssignKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* anchor = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
+    auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
+    auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
+    auto* im_info = context.Input<LoDTensor>("ImInfo");
+
+    auto* loc_index = context.Output<LoDTensor>("LocationIndex");
+    auto* score_index = context.Output<LoDTensor>("ScoreIndex");
+    auto* tgt_bbox = context.Output<LoDTensor>("TargetBBox");
+    auto* tgt_lbl = context.Output<LoDTensor>("TargetLabel");
+
+    PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
+                      "RpnTargetAssignOp gt_boxes needs 1 level of LoD");
+    PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
+                      "RpnTargetAssignOp is_crowd needs 1 level of LoD");
+    int64_t anchor_num = static_cast<int64_t>(anchor->dims()[0]);
+    int64_t batch_num = static_cast<int64_t>(gt_boxes->lod().back().size() - 1);
+
+    int rpn_batch_size_per_im = context.Attr<int>("rpn_batch_size_per_im");
+    float rpn_straddle_thresh = context.Attr<float>("rpn_straddle_thresh");
+    float rpn_positive_overlap = context.Attr<float>("rpn_positive_overlap");
+    float rpn_negative_overlap = context.Attr<float>("rpn_negative_overlap");
+    float rpn_fg_fraction = context.Attr<float>("rpn_fg_fraction");
+    bool use_random = context.Attr<bool>("use_random");
+
+    int64_t max_num = batch_num * rpn_batch_size_per_im;
     auto place = context.GetPlace();
 
-    tgt_bbox_t->mutable_data<T>({max_num, 4}, place);
-    auto* loc_index = loc_index_t->mutable_data<int>({max_num}, place);
-    auto* score_index = score_index_t->mutable_data<int>({max_num}, place);
+    loc_index->mutable_data<int>({max_num}, place);
+    score_index->mutable_data<int>({max_num}, place);
+    tgt_bbox->mutable_data<T>({max_num, 4}, place);
+    tgt_lbl->mutable_data<int>({max_num, 1}, place);
 
-    Tensor tmp_tgt_lbl;
-    auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data<int64_t>({max_num}, place);
     auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
-    math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
-    iset(dev_ctx, &tmp_tgt_lbl, static_cast<int64_t>(-1));
 
     std::random_device rnd;
     std::minstd_rand engine;
-    int seed =
-        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+    int seed = rnd();
     engine.seed(seed);
 
-    int fg_num = 0;
-    int bg_num = 0;
+    framework::LoD lod_loc, loc_score;
+    std::vector<size_t> lod0_loc(1, 0);
+    std::vector<size_t> lod0_score(1, 0);
+
+    int total_loc_num = 0;
+    int total_score_num = 0;
+    auto gt_boxes_lod = gt_boxes->lod().back();
+    auto is_crowd_lod = is_crowd->lod().back();
     for (int i = 0; i < batch_num; ++i) {
-      Tensor dist = dist_t->Slice(lod[i], lod[i + 1]);
-      Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]);
-      auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold,
-                                   rpn_batch_size, fg_num_per_batch, engine,
-                                   tmp_lbl_data + i * anchor_num);
-
-      int cur_fg_num = fg_bg_gt[0].size();
-      int cur_bg_num = fg_bg_gt[1].size();
-      std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index,
-                     [i, anchor_num](int d) { return d + i * anchor_num; });
-      memcpy(score_index, loc_index, cur_fg_num * sizeof(int));
-      std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(),
-                     score_index + cur_fg_num,
-                     [i, anchor_num](int d) { return d + i * anchor_num; });
+      Tensor gt_boxes_slice =
+          gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
+      Tensor is_crowd_slice =
+          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
+      Tensor im_info_slice = im_info->Slice(i, i + 1);
+      auto* im_info_data = im_info_slice.data<T>();
+      auto im_height = im_info_data[0];
+      auto im_width = im_info_data[1];
+      auto im_scale = im_info_data[2];
+
+      // Filter straddle anchor
+      std::vector<Tensor> filter_output = FilterStraddleAnchor<T>(
+          dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width);
+      Tensor inds_inside = filter_output[0];
+      Tensor inside_anchor = filter_output[1];
+
+      // Filter crowd gt
+      Tensor ncrowd_gt_boxes =
+          FilterCrowdGt<T>(dev_ctx, &gt_boxes_slice, &is_crowd_slice);
+      auto ncrowd_gt_boxes_et =
+          framework::EigenTensor<T, 2>::From(ncrowd_gt_boxes);
+      ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale;
+
+      Tensor anchor_by_gt_overlap;
+      anchor_by_gt_overlap.mutable_data<T>(
+          {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place);
+      BboxOverlaps<T>(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap);
+
+      auto loc_score_tgtlbl_gt = SampleRpnFgBgGt<T>(
+          dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im,
+          rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine,
+          use_random);
+
+      Tensor sampled_loc_index = loc_score_tgtlbl_gt[0];
+      Tensor sampled_score_index = loc_score_tgtlbl_gt[1];
+      Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
+      Tensor sampled_gt_index = loc_score_tgtlbl_gt[3];
+
+      int loc_num = sampled_loc_index.dims()[0];
+      int score_num = sampled_score_index.dims()[0];
+      // Map sampled indices (relative to inside anchors) back to all anchors
+      Tensor sampled_loc_index_unmap, sampled_score_index_unmap;
+      sampled_loc_index_unmap.mutable_data<int>({loc_num}, place);
+      sampled_score_index_unmap.mutable_data<int>({score_num}, place);
+      Gather<int>(inds_inside.data<int>(), 1, sampled_loc_index.data<int>(),
+                  loc_num, sampled_loc_index_unmap.data<int>());
+      Gather<int>(inds_inside.data<int>(), 1, sampled_score_index.data<int>(),
+                  score_num, sampled_score_index_unmap.data<int>());
 
       // get target bbox deltas
-      if (cur_fg_num) {
-        Tensor fg_gt;
-        T* gt_data = fg_gt.mutable_data<T>({cur_fg_num, 4}, place);
-        Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num);
-        T* tgt_data = tgt_bbox.data<T>();
-        Gather<T>(anchor_t->data<T>(), 4,
-                  reinterpret_cast<int*>(&fg_bg_gt[0][0]), cur_fg_num,
-                  tgt_data);
-        Gather<T>(gt_bbox.data<T>(), 4, reinterpret_cast<int*>(&fg_bg_gt[2][0]),
-                  cur_fg_num, gt_data);
-        BoxToDelta<T>(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox);
-      }
-
-      loc_index += cur_fg_num;
-      score_index += cur_fg_num + cur_bg_num;
-      fg_num += cur_fg_num;
-      bg_num += cur_bg_num;
-    }
-
-    int lbl_num = fg_num + bg_num;
-    PADDLE_ENFORCE_LE(fg_num, max_num);
-    PADDLE_ENFORCE_LE(lbl_num, max_num);
-
-    tgt_bbox_t->Resize({fg_num, 4});
-    loc_index_t->Resize({fg_num});
-    score_index_t->Resize({lbl_num});
-    auto* lbl_data = tgt_lbl_t->mutable_data<int64_t>({lbl_num, 1}, place);
-    Gather<int64_t>(tmp_lbl_data, 1, score_index_t->data<int>(), lbl_num,
-                    lbl_data);
-  }
-
- private:
-  void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
-                   const int row, const int col, const float pos_threshold,
-                   const float neg_threshold, int64_t* target_label,
-                   std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
-    float epsilon = 0.0001;
-    for (int64_t i = 0; i < row; ++i) {
-      const T* v = dist_data + i * col;
-      T max = *std::max_element(v, v + col);
-      for (int64_t j = 0; j < col; ++j) {
-        if (std::abs(max - v[j]) < epsilon) {
-          target_label[j] = 1;
-        }
-      }
-    }
-
-    // Pick the fg/bg
-    const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
-    for (int64_t j = 0; j < col; ++j) {
-      if (anchor_to_gt_max_data[j] >= pos_threshold) {
-        target_label[j] = 1;
-      } else if (anchor_to_gt_max_data[j] < neg_threshold) {
-        target_label[j] = 0;
-      }
-      if (target_label[j] == 1) {
-        fg_inds->push_back(j);
-      } else if (target_label[j] == 0) {
-        bg_inds->push_back(j);
-      }
+      Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox;
+      auto* sampled_anchor_data =
+          sampled_anchor.mutable_data<T>({loc_num, 4}, place);
+      auto* sampled_gt_data = sampled_gt.mutable_data<T>({loc_num, 4}, place);
+      Gather<T>(anchor->data<T>(), 4, sampled_loc_index_unmap.data<int>(),
+                loc_num, sampled_anchor_data);
+      Gather<T>(ncrowd_gt_boxes.data<T>(), 4, sampled_gt_index.data<int>(),
+                loc_num, sampled_gt_data);
+      sampled_tgt_bbox.mutable_data<T>({loc_num, 4}, place);
+      BoxToDelta<T>(loc_num, sampled_anchor, sampled_gt, nullptr, false,
+                    &sampled_tgt_bbox);
+
+      // Add anchor offset
+      int anchor_offset = i * anchor_num;
+      auto sampled_loc_index_unmap_et =
+          framework::EigenTensor<int, 1>::From(sampled_loc_index_unmap);
+      sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset;
+      auto sampled_score_index_unmap_et =
+          framework::EigenTensor<int, 1>::From(sampled_score_index_unmap);
+      sampled_score_index_unmap_et =
+          sampled_score_index_unmap_et + anchor_offset;
+      AppendRpns<int>(loc_index, total_loc_num, &sampled_loc_index_unmap);
+      AppendRpns<int>(score_index, total_score_num, &sampled_score_index_unmap);
+      AppendRpns<T>(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox);
+      AppendRpns<int>(tgt_lbl, total_score_num, &sampled_tgtlbl);
+      total_loc_num += loc_num;
+
+      total_score_num += score_num;
+      lod0_loc.emplace_back(total_loc_num);
+      lod0_score.emplace_back(total_score_num);
     }
-  }
-
-  void ReservoirSampling(const int num, std::minstd_rand engine,
-                         std::vector<int>* inds) const {
-    std::uniform_real_distribution<float> uniform(0, 1);
-    size_t len = inds->size();
-    if (len > static_cast<size_t>(num)) {
-      for (size_t i = num; i < len; ++i) {
-        int rng_ind = std::floor(uniform(engine) * i);
-        if (rng_ind < num)
-          std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
-      }
-      inds->resize(num);
-    }
-  }
 
-  // std::vector<std::vector<int>> RpnTargetAssign(
-  std::vector<std::vector<int>> SampleFgBgGt(
-      const platform::CPUDeviceContext& ctx, const Tensor& dist,
-      const float pos_threshold, const float neg_threshold,
-      const int rpn_batch_size, const int fg_num, std::minstd_rand engine,
-      int64_t* target_label) const {
-    auto* dist_data = dist.data<T>();
-    int row = dist.dims()[0];
-    int col = dist.dims()[1];
-
-    std::vector<int> fg_inds;
-    std::vector<int> bg_inds;
-    std::vector<int> gt_inds;
-
-    // Calculate the max IoU between anchors and gt boxes
-    // Map from anchor to gt box that has highest overlap
-    auto place = ctx.GetPlace();
-    Tensor anchor_to_gt_max, anchor_to_gt_argmax;
-    anchor_to_gt_max.mutable_data<T>({col}, place);
-    int* argmax = anchor_to_gt_argmax.mutable_data<int>({col}, place);
-
-    auto x = framework::EigenMatrix<T>::From(dist);
-    auto x_col_max = framework::EigenVector<T>::Flatten(anchor_to_gt_max);
-    auto x_col_argmax =
-        framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
-    x_col_max = x.maximum(Eigen::DSizes<int, 1>(0));
-    x_col_argmax = x.argmax(0).template cast<int>();
-
-    // Follow the Faster RCNN's implementation
-    ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
-                neg_threshold, target_label, &fg_inds, &bg_inds);
-    // Reservoir Sampling
-    ReservoirSampling(fg_num, engine, &fg_inds);
-    int fg_num2 = static_cast<int>(fg_inds.size());
-    int bg_num = rpn_batch_size - fg_num2;
-    ReservoirSampling(bg_num, engine, &bg_inds);
-
-    gt_inds.reserve(fg_num2);
-    for (int i = 0; i < fg_num2; ++i) {
-      gt_inds.emplace_back(argmax[fg_inds[i]]);
-    }
-    std::vector<std::vector<int>> fg_bg_gt;
-    fg_bg_gt.emplace_back(fg_inds);
-    fg_bg_gt.emplace_back(bg_inds);
-    fg_bg_gt.emplace_back(gt_inds);
-
-    return fg_bg_gt;
+    PADDLE_ENFORCE_LE(total_loc_num, max_num);
+    PADDLE_ENFORCE_LE(total_score_num, max_num);
+
+    lod_loc.emplace_back(lod0_loc);
+    loc_score.emplace_back(lod0_score);
+    loc_index->set_lod(lod_loc);
+    score_index->set_lod(loc_score);
+    tgt_bbox->set_lod(lod_loc);
+    tgt_lbl->set_lod(loc_score);
+    loc_index->Resize({total_loc_num});
+    score_index->Resize({total_score_num});
+    tgt_bbox->Resize({total_loc_num, 4});
+    tgt_lbl->Resize({total_score_num, 1});
   }
 };
 
@@ -259,18 +460,22 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("Anchor",
              "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
-    AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4].");
-    AddInput(
-        "DistMat",
-        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
-        "[K, M]. It is pair-wise distance matrix between the entities "
-        "represented by each row and each column. For example, assumed one "
-        "entity is A with shape [K], another entity is B with shape [M]. The "
-        "DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
-        "the distance is, the better macthing the pairs are. Please note, "
-        "This tensor can contain LoD information to represent a batch of "
-        "inputs. One instance of this batch can contain different numbers of "
-        "entities.");
+    AddInput("GtBoxes",
+             "(LoDTensor) input groud-truth bbox with shape [K, 4].");
+    AddInput("IsCrowd",
+             "(LoDTensor) input which indicates groud-truth is crowd.");
+    AddInput("ImInfo",
+             "(LoDTensor) input image information with shape [N, 3]. "
+             "N is the batch size, each image information includes height, "
+             "width and scale.");
+    AddAttr<int>("rpn_batch_size_per_im",
+                 "Total number of RPN examples per image.")
+        .SetDefault(256);
+    AddAttr<float>(
+        "rpn_straddle_thresh",
+        "Remove RPN anchors that go outside the image by straddle_thresh "
+        "pixels, "
+        "Set to -1 or a large value, e.g. 100000, to disable pruning anchors.");
     AddAttr<float>(
         "rpn_positive_overlap",
         "Minimum overlap required between an anchor and ground-truth "
@@ -282,20 +487,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
         "box for the (anchor, gt box) pair to be a negative examples.")
         .SetDefault(0.3);
     AddAttr<float>(
-        "fg_fraction",
+        "rpn_fg_fraction",
         "Target fraction of RoI minibatch that "
         "is labeled foreground (i.e. class > 0), 0-th class is background.")
         .SetDefault(0.25);
-    AddAttr<int>("rpn_batch_size_per_im",
-                 "Total number of RPN examples per image.")
-        .SetDefault(256);
-    AddAttr<bool>("fix_seed",
-                  "A flag indicating whether to use a fixed seed to generate "
-                  "random mask. NOTE: DO NOT set this flag to true in "
-                  "training. Setting this flag to true is only useful in "
-                  "unittest.")
-        .SetDefault(false);
-    AddAttr<int>("seed", "RpnTargetAssign random seed.").SetDefault(0);
+    AddAttr<bool>("use_random",
+                  "A flag indicating whether to use a ReservoirSampling. "
+                  "NOTE: DO NOT set this flag to false in training. "
+                  "Setting this flag to false is only useful in unittest.")
+        .SetDefault(true);
     AddOutput(
         "LocationIndex",
         "(Tensor), The indexes of foreground anchors in all RPN anchors, the "
@@ -308,16 +508,16 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
         "ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
         " number.");
     AddOutput("TargetBBox",
-              "(Tensor<int64_t>), The target bbox deltas with shape "
+              "(Tensor), The target bbox deltas with shape "
               "[F, 4], F is the sampled foreground number.");
     AddOutput(
         "TargetLabel",
-        "(Tensor<int64_t>), The target labels of each anchor with shape "
+        "(Tensor<int>), The target labels of each anchor with shape "
         "[F + B, 1], F and B are sampled foreground and backgroud number.");
     AddComment(R"DOC(
-This operator can be, for given the IoU between the ground truth bboxes and the
+This operator can be, for a given set of ground truth bboxes and the
 anchors, to assign classification and regression targets to each prediction.
-The Score index and LocationIndex will be generated according to the DistMat.
+The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU.
 The rest anchors would not contibute to the RPN training loss
 
 ScoreIndex is composed of foreground anchor indexes(positive labels) and
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 1bc1dbbeca..1c73c837e2 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -55,15 +55,19 @@ for _OP in set(__auto__):
     globals()[_OP] = generate_layer_fn(_OP)
 
 
-def rpn_target_assign(loc,
-                      scores,
+def rpn_target_assign(bbox_pred,
+                      cls_logits,
                       anchor_box,
                       anchor_var,
-                      gt_box,
+                      gt_boxes,
+                      is_crowd,
+                      im_info,
                       rpn_batch_size_per_im=256,
-                      fg_fraction=0.25,
+                      rpn_straddle_thresh=0.0,
+                      rpn_fg_fraction=0.5,
                       rpn_positive_overlap=0.7,
-                      rpn_negative_overlap=0.3):
+                      rpn_negative_overlap=0.3,
+                      use_random=True):
     """
     ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **
 
@@ -83,14 +87,13 @@ def rpn_target_assign(loc,
     the positive anchors.
 
     Args:
-        loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
+        bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the
             predicted locations of M bounding bboxes. N is the batch size,
             and each bounding box has four coordinate values and the layout
             is [xmin, ymin, xmax, ymax].
-        scores(Variable): A 3-D Tensor with shape [N, M, C] represents the
-            predicted confidence predictions. N is the batch size, C is the
-            class number, M is number of bounding boxes. For each category
-            there are total M scores which corresponding M bounding boxes.
+        cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] represents the
+            predicted confidence scores. N is the batch size, 1 is the
+            foreground and background sigmoid, M is number of bounding boxes.
         anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
             each box is represented as [xmin, ymin, xmax, ymax],
             [xmin, ymin] is the left top coordinate of the anchor box,
@@ -99,11 +102,16 @@ def rpn_target_assign(loc,
             coordinate of the anchor box.
         anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded 
             variances of anchors.
-        gt_box (Variable): The ground-truth boudding boxes (bboxes) are a 2D
+        gt_boxes (Variable): The ground-truth bounding boxes (bboxes) are a 2D
             LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
             bboxes of mini-batch input.
+        is_crowd (Variable): A 1-D LoDTensor which indicates ground-truth is crowd.
+        im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size,
+            3 is the height, width and scale.
         rpn_batch_size_per_im(int): Total number of RPN examples per image.
-        fg_fraction(float): Target fraction of RoI minibatch that is labeled
+        rpn_straddle_thresh(float): Remove RPN anchors that go outside the image
+            by straddle_thresh pixels.
+        rpn_fg_fraction(float): Target fraction of RoI minibatch that is labeled
             foreground (i.e. class > 0), 0-th class is background.
         rpn_positive_overlap(float): Minimum overlap required between an anchor
             and ground-truth box for the (anchor, gt box) pair to be a positive
@@ -129,45 +137,48 @@ def rpn_target_assign(loc,
     Examples:
         .. code-block:: python
 
-        loc = layers.data(name='location', shape=[2, 80],
+        bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
                           append_batch_size=False, dtype='float32')
-        scores = layers.data(name='scores', shape=[2, 40],
+        cls_logits = layers.data(name='cls_logits', shape=[100, 1],
                           append_batch_size=False, dtype='float32')
         anchor_box = layers.data(name='anchor_box', shape=[20, 4],
                           append_batch_size=False, dtype='float32')
-        gt_box = layers.data(name='gt_box', shape=[10, 4],
+        gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
                          append_batch_size=False, dtype='float32')
         loc_pred, score_pred, loc_target, score_target =
-            fluid.layers.detection_output(loc=location,
-                                          scores=scores,
+            fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
+                                          cls_logits=cls_logits,
                                           anchor_box=anchor_box,
-                                          gt_box=gt_box)
+                                          gt_boxes=gt_boxes)
     """
 
     helper = LayerHelper('rpn_target_assign', **locals())
-    # Compute overlaps between the prior boxes and the gt boxes overlaps
-    iou = iou_similarity(x=gt_box, y=anchor_box)
     # Assign target label to anchors
     loc_index = helper.create_tmp_variable(dtype='int32')
     score_index = helper.create_tmp_variable(dtype='int32')
-    target_label = helper.create_tmp_variable(dtype='int64')
+    target_label = helper.create_tmp_variable(dtype='int32')
     target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
     helper.append_op(
         type="rpn_target_assign",
-        inputs={'Anchor': anchor_box,
-                'GtBox': gt_box,
-                'DistMat': iou},
+        inputs={
+            'Anchor': anchor_box,
+            'GtBoxes': gt_boxes,
+            'IsCrowd': is_crowd,
+            'ImInfo': im_info
+        },
         outputs={
             'LocationIndex': loc_index,
             'ScoreIndex': score_index,
             'TargetLabel': target_label,
-            'TargetBBox': target_bbox,
+            'TargetBBox': target_bbox
         },
         attrs={
             'rpn_batch_size_per_im': rpn_batch_size_per_im,
+            'rpn_straddle_thresh': rpn_straddle_thresh,
             'rpn_positive_overlap': rpn_positive_overlap,
             'rpn_negative_overlap': rpn_negative_overlap,
-            'fg_fraction': fg_fraction
+            'rpn_fg_fraction': rpn_fg_fraction,
+            'use_random': use_random
         })
 
     loc_index.stop_gradient = True
@@ -175,12 +186,12 @@ def rpn_target_assign(loc,
     target_label.stop_gradient = True
     target_bbox.stop_gradient = True
 
-    scores = nn.reshape(x=scores, shape=(-1, 1))
-    loc = nn.reshape(x=loc, shape=(-1, 4))
-    predicted_scores = nn.gather(scores, score_index)
-    predicted_location = nn.gather(loc, loc_index)
+    cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1))
+    bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4))
+    predicted_cls_logits = nn.gather(cls_logits, score_index)
+    predicted_bbox_pred = nn.gather(bbox_pred, loc_index)
 
-    return predicted_scores, predicted_location, target_label, target_bbox
+    return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox
 
 
 def detection_output(loc,
@@ -1258,15 +1269,17 @@ def anchor_generator(input,
 
 def generate_proposal_labels(rpn_rois,
                              gt_classes,
+                             is_crowd,
                              gt_boxes,
-                             im_scales,
+                             im_info,
                              batch_size_per_im=256,
                              fg_fraction=0.25,
                              fg_thresh=0.25,
                              bg_thresh_hi=0.5,
                              bg_thresh_lo=0.0,
                              bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
-                             class_nums=None):
+                             class_nums=None,
+                             use_random=True):
     """
     ** Generate proposal labels Faster-RCNN **
     TODO(buxingyuan): Add Document
@@ -1285,8 +1298,9 @@ def generate_proposal_labels(rpn_rois,
         inputs={
             'RpnRois': rpn_rois,
             'GtClasses': gt_classes,
+            'IsCrowd': is_crowd,
             'GtBoxes': gt_boxes,
-            'ImScales': im_scales
+            'ImInfo': im_info
         },
         outputs={
             'Rois': rois,
@@ -1302,7 +1316,8 @@ def generate_proposal_labels(rpn_rois,
             'bg_thresh_hi': bg_thresh_hi,
             'bg_thresh_lo': bg_thresh_lo,
             'bbox_reg_weights': bbox_reg_weights,
-            'class_nums': class_nums
+            'class_nums': class_nums,
+            'use_random': use_random
         })
 
     rois.stop_gradient = True
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index e2564763d1..56129641ce 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -148,51 +148,60 @@ class TestAnchorGenerator(unittest.TestCase):
 
 class TestGenerateProposalLabels(unittest.TestCase):
     def test_generate_proposal_labels(self):
-        rpn_rois = layers.data(
-            name='rpn_rois',
-            shape=[4, 4],
-            dtype='float32',
-            lod_level=1,
-            append_batch_size=False)
-        gt_classes = layers.data(
-            name='gt_classes',
-            shape=[6],
-            dtype='int32',
-            lod_level=1,
-            append_batch_size=False)
-        gt_boxes = layers.data(
-            name='gt_boxes',
-            shape=[6, 4],
-            dtype='float32',
-            lod_level=1,
-            append_batch_size=False)
-        im_scales = layers.data(
-            name='im_scales',
-            shape=[1],
-            dtype='float32',
-            lod_level=1,
-            append_batch_size=False)
-        class_nums = 5
-        rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels(
-            rpn_rois=rpn_rois,
-            gt_classes=gt_classes,
-            gt_boxes=gt_boxes,
-            im_scales=im_scales,
-            batch_size_per_im=2,
-            fg_fraction=0.5,
-            fg_thresh=0.5,
-            bg_thresh_hi=0.5,
-            bg_thresh_lo=0.0,
-            bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
-            class_nums=class_nums)
-        assert rois.shape[1] == 4
-        assert rois.shape[0] == labels_int32.shape[0]
-        assert rois.shape[0] == bbox_targets.shape[0]
-        assert rois.shape[0] == bbox_inside_weights.shape[0]
-        assert rois.shape[0] == bbox_outside_weights.shape[0]
-        assert bbox_targets.shape[1] == 4 * class_nums
-        assert bbox_inside_weights.shape[1] == 4 * class_nums
-        assert bbox_outside_weights.shape[1] == 4 * class_nums
+        program = Program()
+        with program_guard(program):
+            rpn_rois = layers.data(
+                name='rpn_rois',
+                shape=[4, 4],
+                dtype='float32',
+                lod_level=1,
+                append_batch_size=False)
+            gt_classes = layers.data(
+                name='gt_classes',
+                shape=[6],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
+            is_crowd = layers.data(
+                name='is_crowd',
+                shape=[6],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
+            gt_boxes = layers.data(
+                name='gt_boxes',
+                shape=[6, 4],
+                dtype='float32',
+                lod_level=1,
+                append_batch_size=False)
+            im_info = layers.data(
+                name='im_info',
+                shape=[1, 3],
+                dtype='float32',
+                lod_level=1,
+                append_batch_size=False)
+            class_nums = 5
+            rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels(
+                rpn_rois=rpn_rois,
+                gt_classes=gt_classes,
+                is_crowd=is_crowd,
+                gt_boxes=gt_boxes,
+                im_info=im_info,
+                batch_size_per_im=2,
+                fg_fraction=0.5,
+                fg_thresh=0.5,
+                bg_thresh_hi=0.5,
+                bg_thresh_lo=0.0,
+                bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
+                class_nums=class_nums)
+            assert rois.shape[1] == 4
+            assert rois.shape[0] == labels_int32.shape[0]
+            assert rois.shape[0] == bbox_targets.shape[0]
+            assert rois.shape[0] == bbox_inside_weights.shape[0]
+            assert rois.shape[0] == bbox_outside_weights.shape[0]
+            assert bbox_targets.shape[1] == 4 * class_nums
+            assert bbox_inside_weights.shape[1] == 4 * class_nums
+            assert bbox_outside_weights.shape[1] == 4 * class_nums
 
 
 class TestMultiBoxHead(unittest.TestCase):
@@ -254,18 +263,18 @@ class TestRpnTargetAssign(unittest.TestCase):
     def test_rpn_target_assign(self):
         program = Program()
         with program_guard(program):
-            loc_shape = [10, 50, 4]
-            score_shape = [10, 50, 2]
+            bbox_pred_shape = [10, 50, 4]
+            cls_logits_shape = [10, 50, 2]
             anchor_shape = [50, 4]
 
-            loc = layers.data(
-                name='loc',
-                shape=loc_shape,
+            bbox_pred = layers.data(
+                name='bbox_pred',
+                shape=bbox_pred_shape,
                 append_batch_size=False,
                 dtype='float32')
-            scores = layers.data(
-                name='scores',
-                shape=score_shape,
+            cls_logits = layers.data(
+                name='cls_logits',
+                shape=cls_logits_shape,
                 append_batch_size=False,
                 dtype='float32')
             anchor_box = layers.data(
@@ -278,17 +287,31 @@ class TestRpnTargetAssign(unittest.TestCase):
                 shape=anchor_shape,
                 append_batch_size=False,
                 dtype='float32')
-            gt_box = layers.data(
-                name='gt_box', shape=[4], lod_level=1, dtype='float32')
-
+            gt_boxes = layers.data(
+                name='gt_boxes', shape=[4], lod_level=1, dtype='float32')
+            is_crowd = layers.data(
+                name='is_crowd',
+                shape=[10],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
+            im_info = layers.data(
+                name='im_info',
+                shape=[1, 3],
+                dtype='float32',
+                lod_level=1,
+                append_batch_size=False)
             pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
-                loc=loc,
-                scores=scores,
+                bbox_pred=bbox_pred,
+                cls_logits=cls_logits,
                 anchor_box=anchor_box,
                 anchor_var=anchor_var,
-                gt_box=gt_box,
+                gt_boxes=gt_boxes,
+                is_crowd=is_crowd,
+                im_info=im_info,
                 rpn_batch_size_per_im=256,
-                fg_fraction=0.25,
+                rpn_straddle_thresh=0.0,
+                rpn_fg_fraction=0.5,
                 rpn_positive_overlap=0.7,
                 rpn_negative_overlap=0.3)
 
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
similarity index 77%
rename from python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
rename to python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
index 6dc101b6da..2d5cd3b24b 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
@@ -20,10 +20,10 @@ import paddle.fluid as fluid
 from op_test import OpTest
 
 
-def generate_proposal_labels_in_python(
-        rpn_rois, gt_classes, gt_boxes, im_scales, batch_size_per_im,
-        fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
-        class_nums):
+def generate_proposal_labels_in_python(rpn_rois, gt_classes, is_crowd, gt_boxes,
+                                       im_info, batch_size_per_im, fg_fraction,
+                                       fg_thresh, bg_thresh_hi, bg_thresh_lo,
+                                       bbox_reg_weights, class_nums):
     rois = []
     labels_int32 = []
     bbox_targets = []
@@ -31,13 +31,13 @@ def generate_proposal_labels_in_python(
     bbox_outside_weights = []
     lod = []
     assert len(rpn_rois) == len(
-        im_scales), 'batch size of rpn_rois and ground_truth is not matched'
+        im_info), 'batch size of rpn_rois and ground_truth is not matched'
 
-    for im_i in range(len(im_scales)):
+    for im_i in range(len(im_info)):
         frcn_blobs = _sample_rois(
-            rpn_rois[im_i], gt_classes[im_i], gt_boxes[im_i], im_scales[im_i],
-            batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
-            bg_thresh_lo, bbox_reg_weights, class_nums)
+            rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i],
+            im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh,
+            bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums)
 
         lod.append(frcn_blobs['rois'].shape[0])
 
@@ -50,13 +50,14 @@ def generate_proposal_labels_in_python(
     return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod
 
 
-def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
-                 fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo,
-                 bbox_reg_weights, class_nums):
+def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
+                 batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
+                 bg_thresh_lo, bbox_reg_weights, class_nums):
     rois_per_image = int(batch_size_per_im)
     fg_rois_per_im = int(np.round(fg_fraction * rois_per_image))
 
     # Roidb
+    im_scale = im_info[2]
     inv_im_scale = 1. / im_scale
     rpn_rois = rpn_rois * inv_im_scale
 
@@ -78,6 +79,9 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
         box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[
             overlapped_boxes_ind]
 
+    crowd_ind = np.where(is_crowd)[0]
+    gt_overlaps[crowd_ind] = -1
+
     max_overlaps = gt_overlaps.max(axis=1)
     max_classes = gt_overlaps.argmax(axis=1)
 
@@ -85,9 +89,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
     fg_inds = np.where(max_overlaps >= fg_thresh)[0]
     fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0])
     # Sample foreground if there are too many
-    if fg_inds.shape[0] > fg_rois_per_this_image:
-        fg_inds = np.random.choice(
-            fg_inds, size=fg_rois_per_this_image, replace=False)
+    # if fg_inds.shape[0] > fg_rois_per_this_image:
+    #     fg_inds = np.random.choice(
+    #         fg_inds, size=fg_rois_per_this_image, replace=False)
+    fg_inds = fg_inds[:fg_rois_per_this_image]
 
     # Background
     bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
@@ -96,9 +101,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
     bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
                                         bg_inds.shape[0])
     # Sample background if there are too many
-    if bg_inds.shape[0] > bg_rois_per_this_image:
-        bg_inds = np.random.choice(
-            bg_inds, size=bg_rois_per_this_image, replace=False)
+    # if bg_inds.shape[0] > bg_rois_per_this_image:
+    #     bg_inds = np.random.choice(
+    #         bg_inds, size=bg_rois_per_this_image, replace=False)
+    bg_inds = bg_inds[:bg_rois_per_this_image]
 
     keep_inds = np.append(fg_inds, bg_inds)
     sampled_labels = max_classes[keep_inds]
@@ -208,8 +214,9 @@ class TestGenerateProposalLabelsOp(OpTest):
         self.inputs = {
             'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod),
             'GtClasses': (self.gt_classes[0], self.gts_lod),
+            'IsCrowd': (self.is_crowd[0], self.gts_lod),
             'GtBoxes': (self.gt_boxes[0], self.gts_lod),
-            'ImScales': self.im_scales[0]
+            'ImInfo': self.im_info
         }
         self.attrs = {
             'batch_size_per_im': self.batch_size_per_im,
@@ -218,14 +225,15 @@ class TestGenerateProposalLabelsOp(OpTest):
             'bg_thresh_hi': self.bg_thresh_hi,
             'bg_thresh_lo': self.bg_thresh_lo,
             'bbox_reg_weights': self.bbox_reg_weights,
-            'class_nums': self.class_nums
+            'class_nums': self.class_nums,
+            'use_random': False
         }
         self.outputs = {
-            'Rois': (self.rois[0], [self.lod]),
-            'LabelsInt32': (self.labels_int32[0], [self.lod]),
-            'BboxTargets': (self.bbox_targets[0], [self.lod]),
-            'BboxInsideWeights': (self.bbox_inside_weights[0], [self.lod]),
-            'BboxOutsideWeights': (self.bbox_outside_weights[0], [self.lod]),
+            'Rois': (self.rois, [self.lod]),
+            'LabelsInt32': (self.labels_int32, [self.lod]),
+            'BboxTargets': (self.bbox_targets, [self.lod]),
+            'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]),
+            'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]),
         }
 
     def test_check_output(self):
@@ -236,8 +244,8 @@ class TestGenerateProposalLabelsOp(OpTest):
         self.set_data()
 
     def init_test_params(self):
-        self.batch_size_per_im = 10
-        self.fg_fraction = 1.0
+        self.batch_size_per_im = 512
+        self.fg_fraction = 0.25
         self.fg_thresh = 0.5
         self.bg_thresh_hi = 0.5
         self.bg_thresh_lo = 0.0
@@ -246,14 +254,14 @@ class TestGenerateProposalLabelsOp(OpTest):
 
     def init_test_input(self):
         np.random.seed(0)
-        image_nums = 1
         gt_nums = 6  # Keep same with batch_size_per_im for unittest
-        proposal_nums = self.batch_size_per_im - gt_nums
-        images_shape = []
-        self.im_scales = []
-        for i in range(image_nums):
-            images_shape.append(np.random.randint(200, size=2))
-            self.im_scales.append(np.ones((1)).astype(np.float32))
+        proposal_nums = 2000  #self.batch_size_per_im - gt_nums
+        images_shape = [[64, 64]]
+        self.im_info = np.ones((len(images_shape), 3)).astype(np.float32)
+        for i in range(len(images_shape)):
+            self.im_info[i, 0] = images_shape[i][0]
+            self.im_info[i, 1] = images_shape[i][1]
+            self.im_info[i, 2] = 0.8  #scale
 
         self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape,
                                                                proposal_nums)
@@ -261,16 +269,23 @@ class TestGenerateProposalLabelsOp(OpTest):
             images_shape, self.class_nums, gt_nums)
         self.gt_classes = [gt['gt_classes'] for gt in ground_truth]
         self.gt_boxes = [gt['boxes'] for gt in ground_truth]
+        self.is_crowd = [gt['is_crowd'] for gt in ground_truth]
 
     def init_test_output(self):
         self.rois, self.labels_int32, self.bbox_targets, \
         self.bbox_inside_weights, self.bbox_outside_weights, \
         self.lod = generate_proposal_labels_in_python(
-                self.rpn_rois, self.gt_classes, self.gt_boxes, self.im_scales,
+                self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes, self.im_info,
                 self.batch_size_per_im, self.fg_fraction,
                 self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo,
                 self.bbox_reg_weights, self.class_nums
             )
+        self.rois = np.vstack(self.rois)
+        self.labels_int32 = np.hstack(self.labels_int32)
+        self.labels_int32 = self.labels_int32[:, np.newaxis]
+        self.bbox_targets = np.vstack(self.bbox_targets)
+        self.bbox_inside_weights = np.vstack(self.bbox_inside_weights)
+        self.bbox_outside_weights = np.vstack(self.bbox_outside_weights)
 
 
 def _generate_proposals(images_shape, proposal_nums):
@@ -280,7 +295,7 @@ def _generate_proposals(images_shape, proposal_nums):
     for i, image_shape in enumerate(images_shape):
         proposals = _generate_boxes(image_shape, proposal_nums)
         rpn_rois.append(proposals)
-        num_proposals += len(proposals)
+        num_proposals = len(proposals)
         rpn_rois_lod.append(num_proposals)
     return rpn_rois, [rpn_rois_lod]
 
@@ -294,7 +309,11 @@ def _generate_groundtruth(images_shape, class_nums, gt_nums):
         gt_classes = np.random.randint(
             low=1, high=class_nums, size=gt_nums).astype(np.int32)
         gt_boxes = _generate_boxes(image_shape, gt_nums)
-        ground_truth.append(dict(gt_classes=gt_classes, boxes=gt_boxes))
+        is_crowd = np.zeros((gt_nums), dtype=np.int32)
+        is_crowd[0] = 1
+        ground_truth.append(
+            dict(
+                gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd))
         num_gts += len(gt_classes)
         gts_lod.append(num_gts)
     return ground_truth, [gts_lod]
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
similarity index 88%
rename from python/paddle/fluid/tests/unittests/test_generate_proposals.py
rename to python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
index 3fbd2ce95a..86e27fe29e 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
@@ -114,10 +114,10 @@ def box_coder(all_anchors, bbox_deltas, variances):
     #anchor_loc: width, height, center_x, center_y
     anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32)
 
-    anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0]
-    anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1]
-    anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2
-    anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2
+    anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1
+    anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1
+    anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0]
+    anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1]
 
     #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height 
     pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32)
@@ -127,23 +127,29 @@ def box_coder(all_anchors, bbox_deltas, variances):
                 i, 0] + anchor_loc[i, 2]
             pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[
                 i, 1] + anchor_loc[i, 3]
-            pred_bbox[i, 2] = math.exp(variances[i, 2] *
-                                       bbox_deltas[i, 2]) * anchor_loc[i, 0]
-            pred_bbox[i, 3] = math.exp(variances[i, 3] *
-                                       bbox_deltas[i, 3]) * anchor_loc[i, 1]
+            pred_bbox[i, 2] = math.exp(
+                min(variances[i, 2] * bbox_deltas[i, 2], math.log(
+                    1000 / 16.0))) * anchor_loc[i, 0]
+            pred_bbox[i, 3] = math.exp(
+                min(variances[i, 3] * bbox_deltas[i, 3], math.log(
+                    1000 / 16.0))) * anchor_loc[i, 1]
     else:
         for i in range(bbox_deltas.shape[0]):
             pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[
                 i, 2]
             pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[
                 i, 3]
-            pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0]
-            pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1]
+            pred_bbox[i, 2] = math.exp(
+                min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i,
+                                                                            0]
+            pred_bbox[i, 3] = math.exp(
+                min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i,
+                                                                            1]
 
     proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2
     proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2
-    proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2
-    proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2
+    proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1
+    proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1
 
     return proposals
 
@@ -170,13 +176,16 @@ def filter_boxes(boxes, min_size, im_info):
     """Only keep boxes with both sides >= min_size and center within the image.
     """
     # Scale min_size to match image scale
-    min_size *= im_info[2]
+    im_scale = im_info[2]
+    min_size = max(min_size, 1.0)
     ws = boxes[:, 2] - boxes[:, 0] + 1
     hs = boxes[:, 3] - boxes[:, 1] + 1
+    ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1
+    hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
     x_ctr = boxes[:, 0] + ws / 2.
     y_ctr = boxes[:, 1] + hs / 2.
-    keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) &
-                    (y_ctr < im_info[0]))[0]
+    keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) &
+                    (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0]
     return keep
 
 
@@ -204,7 +213,7 @@ def iou(box_a, box_b):
     xb = min(xmax_a, xmax_b)
     yb = min(ymax_a, ymax_b)
 
-    inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0)
+    inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0)
 
     iou_ratio = inter_area / (area_a + area_b - inter_area)
 
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
index bd548009b3..f63dbcd3d7 100644
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -19,48 +19,58 @@ import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
 from test_anchor_generator_op import anchor_generator_in_python
-from test_generate_proposal_labels import _generate_groundtruth
-from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta
-
-
-def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im,
-                      rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
-    iou = np.transpose(gt_anchor_iou)
-    anchor_to_gt_max = iou.max(axis=1)
-    anchor_to_gt_argmax = iou.argmax(axis=1)
-
-    gt_to_anchor_argmax = iou.argmax(axis=0)
-    gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
-    anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
-
-    tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1
-    tgt_lbl[anchors_with_max_overlap] = 1
-    tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1
-
-    num_fg = int(fg_fraction * rpn_batch_size_per_im)
-    fg_inds = np.where(tgt_lbl == 1)[0]
-    if len(fg_inds) > num_fg:
+from test_generate_proposal_labels_op import _generate_groundtruth
+from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta
+
+
+def rpn_target_assign(anchor_by_gt_overlap,
+                      rpn_batch_size_per_im,
+                      rpn_positive_overlap,
+                      rpn_negative_overlap,
+                      rpn_fg_fraction,
+                      use_random=True):
+    anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
+    anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
+        anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
+
+    gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
+    gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
+        anchor_by_gt_overlap.shape[1])]
+    anchors_with_max_overlap = np.where(
+        anchor_by_gt_overlap == gt_to_anchor_max)[0]
+
+    labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1
+    labels[anchors_with_max_overlap] = 1
+    labels[anchor_to_gt_max >= rpn_positive_overlap] = 1
+
+    num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im)
+    fg_inds = np.where(labels == 1)[0]
+    if len(fg_inds) > num_fg and use_random:
         disable_inds = np.random.choice(
             fg_inds, size=(len(fg_inds) - num_fg), replace=False)
-        tgt_lbl[disable_inds] = -1
-    fg_inds = np.where(tgt_lbl == 1)[0]
+    else:
+        disable_inds = fg_inds[num_fg:]
+    labels[disable_inds] = -1
+    fg_inds = np.where(labels == 1)[0]
 
-    num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
+    num_bg = rpn_batch_size_per_im - np.sum(labels == 1)
     bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
-    tgt_lbl[bg_inds] = 0
-    if len(bg_inds) > num_bg:
+    if len(bg_inds) > num_bg and use_random:
         enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
-        tgt_lbl[enable_inds] = 0
-    bg_inds = np.where(tgt_lbl == 0)[0]
-    tgt_lbl[bg_inds] = 0
+    else:
+        enable_inds = bg_inds[:num_bg]
+    labels[enable_inds] = 0
+    fg_inds = np.where(labels == 1)[0]
+    bg_inds = np.where(labels == 0)[0]
 
     loc_index = fg_inds
     score_index = np.hstack((fg_inds, bg_inds))
-    tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
+    labels = labels[score_index]
+    assert not np.any(labels == -1), "Wrong labels with -1"
 
     gt_inds = anchor_to_gt_argmax[fg_inds]
 
-    return loc_index, score_index, tgt_lbl, gt_inds
+    return loc_index, score_index, labels, gt_inds
 
 
 def get_anchor(n, c, h, w):
@@ -75,85 +85,129 @@ def get_anchor(n, c, h, w):
     return anchors
 
 
-def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im,
-             rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
-
-    loc_indexes = []
-    score_indexes = []
-    tmp_tgt_labels = []
-    tgt_bboxes = []
-    anchor_num = anchor.shape[0]
-
+def rpn_target_assign_in_python(all_anchors,
+                                gt_boxes,
+                                is_crowd,
+                                im_info,
+                                lod,
+                                rpn_straddle_thresh,
+                                rpn_batch_size_per_im,
+                                rpn_positive_overlap,
+                                rpn_negative_overlap,
+                                rpn_fg_fraction,
+                                use_random=True):
+    anchor_num = all_anchors.shape[0]
     batch_size = len(lod) - 1
     for i in range(batch_size):
+        im_height = im_info[i][0]
+        im_width = im_info[i][1]
+        im_scale = im_info[i][2]
+        if rpn_straddle_thresh >= 0:
+            # Only keep anchors inside the image by a margin of straddle_thresh
+            inds_inside = np.where(
+                (all_anchors[:, 0] >= -rpn_straddle_thresh) &
+                (all_anchors[:, 1] >= -rpn_straddle_thresh) & (
+                    all_anchors[:, 2] < im_width + rpn_straddle_thresh) & (
+                        all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0]
+            # keep only inside anchors
+            inside_anchors = all_anchors[inds_inside, :]
+        else:
+            inds_inside = np.arange(all_anchors.shape[0])
+            inside_anchors = all_anchors
+
         b, e = lod[i], lod[i + 1]
-        iou_slice = iou[b:e, :]
-        bboxes_slice = gt_boxes[b:e, :]
+        gt_boxes_slice = gt_boxes[b:e, :] * im_scale
+        is_crowd_slice = is_crowd[b:e]
 
-        loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign(
-            iou_slice, rpn_batch_size_per_im, rpn_positive_overlap,
-            rpn_negative_overlap, fg_fraction)
+        not_crowd_inds = np.where(is_crowd_slice == 0)[0]
+        gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
+        iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)
 
-        fg_bboxes = bboxes_slice[gt_inds]
-        fg_anchors = anchor[loc_idx]
-        box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.])
+        loc_inds, score_inds, labels, gt_inds = rpn_target_assign(
+            iou, rpn_batch_size_per_im, rpn_positive_overlap,
+            rpn_negative_overlap, rpn_fg_fraction, use_random)
+        # Map inside-anchor indices back to indices into all_anchors
+        loc_inds = inds_inside[loc_inds]
+        score_inds = inds_inside[score_inds]
+
+        sampled_gt = gt_boxes_slice[gt_inds]
+        sampled_anchor = all_anchors[loc_inds]
+        box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.])
 
         if i == 0:
-            loc_indexes = loc_idx
-            score_indexes = score_idx
-            tmp_tgt_labels = tgt_lbl
+            loc_indexes = loc_inds
+            score_indexes = score_inds
+            tgt_labels = labels
             tgt_bboxes = box_deltas
         else:
             loc_indexes = np.concatenate(
-                [loc_indexes, loc_idx + i * anchor_num])
+                [loc_indexes, loc_inds + i * anchor_num])
             score_indexes = np.concatenate(
-                [score_indexes, score_idx + i * anchor_num])
-            tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl])
+                [score_indexes, score_inds + i * anchor_num])
+            tgt_labels = np.concatenate([tgt_labels, labels])
             tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
 
-    tgt_labels = tmp_tgt_labels[score_indexes]
     return loc_indexes, score_indexes, tgt_bboxes, tgt_labels
 
 
 class TestRpnTargetAssignOp(OpTest):
     def setUp(self):
         n, c, h, w = 2, 4, 14, 14
-        anchor = get_anchor(n, c, h, w)
+        all_anchors = get_anchor(n, c, h, w)
         gt_num = 10
-        anchor = anchor.reshape(-1, 4)
-        anchor_num = anchor.shape[0]
-
-        im_shapes = [[64, 64], [64, 64]]
-        gt_box, lod = _generate_groundtruth(im_shapes, 3, 4)
-        bbox = np.vstack([v['boxes'] for v in gt_box])
-
-        iou = _bbox_overlaps(bbox, anchor)
-
-        anchor = anchor.astype('float32')
-        bbox = bbox.astype('float32')
-        iou = iou.astype('float32')
-
-        loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob(
-            anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25)
+        all_anchors = all_anchors.reshape(-1, 4)
+        anchor_num = all_anchors.shape[0]
+
+        images_shape = [[64, 64], [64, 64]]
+        #images_shape = [[64, 64]]
+        groundtruth, lod = _generate_groundtruth(images_shape, 3, 4)
+        lod = [0, 4, 8]
+        #lod = [0, 4]
+
+        im_info = np.ones((len(images_shape), 3)).astype(np.float32)
+        for i in range(len(images_shape)):
+            im_info[i, 0] = images_shape[i][0]
+            im_info[i, 1] = images_shape[i][1]
+            im_info[i, 2] = 0.8  #scale
+        gt_boxes = np.vstack([v['boxes'] for v in groundtruth])
+        is_crowd = np.hstack([v['is_crowd'] for v in groundtruth])
+
+        all_anchors = all_anchors.astype('float32')
+        gt_boxes = gt_boxes.astype('float32')
+
+        rpn_straddle_thresh = 0.0
+        rpn_batch_size_per_im = 256
+        rpn_positive_overlap = 0.7
+        rpn_negative_overlap = 0.3
+        rpn_fg_fraction = 0.5
+        use_random = False
+
+        loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python(
+            all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh,
+            rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap,
+            rpn_fg_fraction, use_random)
+        labels = labels[:, np.newaxis]
 
         self.op_type = "rpn_target_assign"
         self.inputs = {
-            'Anchor': anchor,
-            'GtBox': (bbox, [[4, 4]]),
-            'DistMat': (iou, [[4, 4]]),
+            'Anchor': all_anchors,
+            'GtBoxes': (gt_boxes, [[4, 4]]),
+            'IsCrowd': (is_crowd, [[4, 4]]),
+            'ImInfo': (im_info, [[1, 1]])
         }
         self.attrs = {
-            'rpn_batch_size_per_im': 25600,
-            'rpn_positive_overlap': 0.95,
-            'rpn_negative_overlap': 0.03,
-            'fg_fraction': 0.25,
-            'fix_seed': True
+            'rpn_batch_size_per_im': rpn_batch_size_per_im,
+            'rpn_straddle_thresh': rpn_straddle_thresh,
+            'rpn_positive_overlap': rpn_positive_overlap,
+            'rpn_negative_overlap': rpn_negative_overlap,
+            'rpn_fg_fraction': rpn_fg_fraction,
+            'use_random': use_random
         }
         self.outputs = {
             'LocationIndex': loc_index.astype('int32'),
             'ScoreIndex': score_index.astype('int32'),
             'TargetBBox': tgt_bbox.astype('float32'),
-            'TargetLabel': tgt_lbl.astype('int64'),
+            'TargetLabel': labels.astype('int32')
         }
 
     def test_check_output(self):