From d384d39a68457c837b46df3260a343041e934637 Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Thu, 19 Jul 2018 03:14:04 +0000
Subject: [PATCH 01/37] Temporarily add code with bug

---
 .../inference/tensorrt/convert/CMakeLists.txt |  5 +-
 .../inference/tensorrt/convert/pool2d_op.cc   | 75 +++++++++++++++++++
 .../tensorrt/convert/test_pool2d_op.cc        | 63 ++++++++++++++++
 3 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
 create mode 100644 paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc

diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 748f5a084e..c841510ae6 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Add TRT tests
 nv_library(tensorrt_converter
-  SRCS mul_op.cc conv2d_op.cc fc_op.cc
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc
   DEPS tensorrt_engine mul_op)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -13,3 +13,6 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
+
+nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)

diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
new file mode 100644
index 0000000000..ea282a79d5
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -0,0 +1,75 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

namespace paddle {
namespace inference {
namespace tensorrt {

/*
 * Pool2dOp, IPoolingLayer in TRT. This layer doesn't have weights.
+ */ +class Pool2dOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) + << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + std::string pool_type = + boost::get(op_desc.GetAttr("pooling_type")); + std::vector ksize = + boost::get>(op_desc.GetAttr("ksize")); + std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + + const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + const nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + + nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kMAX; + if (pool_type == "max") { + pool_t = nvinfer1::PoolingType::kMAX; + } else if (pool_type == "avg") { + pool_t = nvinfer1::PoolingType::kAVERAGE; + } else { + PADDLE_THROW("TensorRT unsupported pooling type!"); + } + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, + *const_cast(input1), + pool_t, nv_ksize); + PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created."); + layer->setStride(nv_strides); + layer->setPadding(nv_paddings); + + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); +REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc new file mode 100644 index 0000000000..261eb2c6ce --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(Pool2dOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("pool2d-X", nvinfer1::Dims4(10, 3, 2, 2)); + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims4(10, 3, 1, 1)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"pool2d-X"}); + desc.SetOutput("Out", {"pool2d-Out"}); + + std::vector ksize({2, 2}); + std::vector strides({1, 1}); + std::vector paddings({0, 0}); + std::string pooling_t = "max"; + + desc.SetAttr("pooling_type", pooling_t); + desc.SetAttr("ksize", ksize); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + // std::string temp = ""; + // (*desc.Proto()).SerializeToString(&temp); + + // std::cout << temp << std::endl; + // std::ofstream f("__temp__", std::ios::out); + // f << temp; + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(10); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); From dcaf183daa2629d0032874632a0b90fa78d3d9ff Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 17 Jul 2018 21:34:00 +0800 Subject: [PATCH 02/37] builder SSA graph at the beginning. --- .../details/multi_devices_graph_builder.cc | 27 ++-- .../framework/details/ssa_graph_builder.cc | 11 ++ paddle/fluid/framework/ir/graph.cc | 133 ++++++++++++++++-- paddle/fluid/framework/ir/graph.h | 3 + paddle/fluid/framework/ir/graph_test.cc | 1 + paddle/fluid/framework/ir/node.h | 1 + 6 files changed, 150 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index f1f8674caf..dc9183e96a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -221,15 +221,15 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // forward, backward nodes. E.g. you can't append an forward node // at the end of the node list. // TODO(panyx0718): FIXME: Needs to sort by forward->backward order. - for (auto &node : nodes) { - if (node->NodeType() != ir::Node::Type::kOperation) continue; + for (ir::Node *node : TopologySortOperationFromInToOut(nodes)) { + VLOG(3) << "apply node: " << node->Name() << reinterpret_cast(node); if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - CreateRPCOp(&result, node.get()); - } else if (IsDistTrainOp(node.get(), send_vars, recv_vars)) { - CreateDistTrainOp(&result, node.get()); - } else if (IsScaleLossOp(node.get())) { + CreateRPCOp(&result, node); + } else if (IsDistTrainOp(node, send_vars, recv_vars)) { + CreateDistTrainOp(&result, node); + } else if (IsScaleLossOp(node)) { // user can customize loss@grad if not use_default_grad_scale_ if (strategy_.gradient_scale_ != BuildStrategy::GradientScaleStrategy::kCustomized) { @@ -240,10 +240,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // It also assumes backward op will always follow the forward op in // the block. 
       is_forwarding = false;
+      LOG(ERROR) << "forward flipping!!!!!!!";
     } else {
-      int op_dev_id = GetOpDeviceID(node.get());
+      int op_dev_id = GetOpDeviceID(node);
       if (op_dev_id != -1) {  // This op only runs on one specific device.
-        CreateComputationalOp(&result, node.get(), op_dev_id);
+        CreateComputationalOp(&result, node, op_dev_id);
         for (ir::Node *n : node->outputs) {
           var_name_on_devices_.emplace(n->Name(), op_dev_id);
         }
@@ -252,13 +253,11 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
         // gradients.
         if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) {
           node->Op()->SetAttr("throw_eof_exp", false);
-          CreateComputationalOps(&result, node.get(), places_.size());
-          // TODO(paddle-dev): builder shouldn't depend on the out logic of
-          // a specific op.
+          CreateComputationalOps(&result, node, places_.size());
           const auto &data_var_names = node->Op()->Output("Out");
           InsertDataBalanceOp(&result, data_var_names);
         } else {
-          CreateComputationalOps(&result, node.get(), places_.size());
+          CreateComputationalOps(&result, node, places_.size());
         }

         if (!is_forwarding && places_.size() > 1) {
@@ -479,8 +478,8 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {

   PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
   int dev_id = GetVarDeviceID(param_grad[1]);
-  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]",
-                    node->Op()->Type(), param_grad[0]);
+  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
+                    node->Op()->Type(), param_grad[0], param_grad[1]);
   return dev_id;
 }

diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 7bc130ef6e..2be4bb009e 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -37,6 +37,17 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) {
           continue;
         }

+        bool has_dep = false;
+        for (auto read_out : read_op->Outputs()) {
+          for (auto write_in : write_op->Inputs()) {
+            if (read_out == write_in) {
+              has_dep = true;
+              break;
+            }
+          }
+        }
+        if (has_dep) continue;
+
         auto *dep_var = new DummyVarHandle(
             graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
         read_op->AddOutput(dep_var);

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index e4021aa92b..f297461ab2 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -12,14 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <algorithm>
+#include <unordered_set>
+
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/var_desc.h"

 namespace paddle {
 namespace framework {
+namespace {
+void SortHelper(
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    ir::Node *node, std::unordered_set<ir::Node *> *visited,
+    std::vector<ir::Node *> *ret) {
+  visited->insert(node);
+
+  for (auto adj : adj_list.at(node)) {
+    if (visited->find(adj) == visited->end()) {
+      SortHelper(adj_list, adj, visited, ret);
+    }
+  }
+
+  VLOG(3) << "topology sort insert: " << node->Name()
+          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
+  ret->push_back(node);
+}
+}  // namespace

-// NOTE(paddle-dev): This graph contains circle.
 Graph::Graph(const ProgramDesc &program) : program_(program) {
   VLOG(3) << "block in program:" << program_.Size();
   std::unordered_map<std::string, VarDesc *> all_vars;
   for (auto *var : program.Block(0).AllVars()) {
     all_vars.emplace(var->Name(), var);
   }

-  std::map<std::string, ir::Node *> var_nodes;
+  ir::Node *last_backward = nullptr;
+  std::vector<ir::Node *> optimize_ops;
+  std::map<std::string, std::vector<ir::Node *>> var_nodes;
   for (auto *op : program.Block(0).AllOps()) {
     ir::Node *node = CreateOpNode(op);
+    if (boost::get<int>(
+            op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kBackward)) {
+      last_backward = node;
+    } else if (boost::get<int>(
+                   op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+               static_cast<int>(OpRole::kOptimize)) {
+      optimize_ops.push_back(node);
+    }

     for (auto &each_var_name : op->InputArgumentNames()) {
       ir::Node *var = nullptr;
       if (var_nodes.find(each_var_name) != var_nodes.end()) {
-        var = var_nodes.at(each_var_name);
+        var = var_nodes.at(each_var_name).back();
       } else if (all_vars.count(each_var_name) != 0) {
         var = CreateVarNode(all_vars.at(each_var_name));
-        var_nodes[each_var_name] = var;
+        var_nodes[each_var_name].push_back(var);
       } else {
         // TODO(paddle-dev): Seems some assumption doesn't hold?
         VLOG(3) << op->Type()
                 << " input var not in all_var list: " << each_var_name;
         var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
-        var_nodes[each_var_name] = var;
+        var_nodes[each_var_name].push_back(var);
       }
       node->inputs.push_back(var);
       var->outputs.push_back(node);
     }

     for (auto &each_var_name : op->OutputArgumentNames()) {
-      ir::Node *var = nullptr;
-      if (var_nodes.find(each_var_name) != var_nodes.end()) {
-        var = var_nodes.at(each_var_name);
-      } else {
-        var = CreateVarNode(all_vars.at(each_var_name));
-        var_nodes[each_var_name] = var;
-      }
+      ir::Node *var = CreateVarNode(all_vars.at(each_var_name));
+      var_nodes[each_var_name].push_back(var);
       node->outputs.push_back(var);
       var->inputs.push_back(node);
     }
   }
+
+  for (auto &var : var_nodes) {
+    auto &versions = var.second;
+    if (versions.size() <= 1) continue;
+
+    auto it_new = versions.rbegin();
+    auto it_old = versions.rbegin();
+    ++it_old;
+    for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
+      ir::Node *write_op =
+          (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
+      const auto &read_ops = (*it_old)->outputs;
+
+      for (auto *read_op : read_ops) {
+        // Manually add a dependency var from read_op to write_op;
+        if (read_op == write_op) {
+          // Read Write is the same op.
+          continue;
+        }
+        ir::Node *dep_var = CreateEmptyNode("dummy", ir::Node::Type::kVariable);
+        read_op->outputs.push_back(dep_var);
+        dep_var->inputs.push_back(read_op);
+        write_op->inputs.push_back(dep_var);
+        dep_var->outputs.push_back(write_op);
+      }
+    }
+  }
+
+  if (last_backward) {
+    for (ir::Node *opt_node : optimize_ops) {
+      ir::Node *dep_var = CreateEmptyNode("dummy", ir::Node::Type::kVariable);
+      last_backward->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(last_backward);
+      opt_node->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(opt_node);
+      VLOG(3) << "appending connect: " << last_backward->Name()
+              << reinterpret_cast<void *>(last_backward) << "->"
+              << opt_node->Name() << reinterpret_cast<void *>(opt_node);
+    }
+  }
+}
+
+std::vector<ir::Node *> TopologySortOperationFromInToOut(
+    const std::vector<std::unique_ptr<ir::Node>> &nodes) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+  std::unordered_set<ir::Node *> visited;
+  std::vector<ir::Node *> ret;
+
+  for (auto &n : nodes) {
+    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    if (adj_list.find(n.get()) == adj_list.end()) {
+      adj_list[n.get()] = std::unordered_set<ir::Node *>();
+    }
+    for (auto &var : n->inputs) {
+      for (auto &adj_n : var->inputs) {
+        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
+        adj_list[n.get()].insert(adj_n);
+        LOG(ERROR) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
+                   << " -> " << n->Name() << reinterpret_cast<void *>(n.get())
+                   << " via " << var->Name() << reinterpret_cast<void *>(var);
+      }
+    }
+  }
+
+  for (auto adj : adj_list) {
+    if (visited.find(adj.first) == visited.end()) {
+      SortHelper(adj_list, adj.first, &visited, &ret);
+    }
+  }
+
+  for (ir::Node *n : ret) {
+    std::unordered_set<ir::Node *> dummy;
+    n->inputs.erase(
+        std::remove_if(n->inputs.begin(), n->inputs.end(),
+                       [n](ir::Node *t) { return t->Name() == "dummy"; }),
+        n->inputs.end());
+    n->outputs.erase(
+        std::remove_if(n->outputs.begin(), n->outputs.end(),
+                       [n](ir::Node *t) { return t->Name() == "dummy"; }),
+        n->outputs.end());
+  }
+  return ret;
 }
+
 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index b4ac135b02..0242edecf4 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -78,5 +78,8 @@ class Graph {
   std::map<std::string, std::function<void(void)>> attr_dels_;
 };

+std::vector<ir::Node *> TopologySortOperationFromInToOut(
+    const std::vector<std::unique_ptr<ir::Node>>& nodes);
+
 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index 4e23bf124f..186047b370 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -76,6 +76,7 @@ TEST(GraphTest, Basic) {
   op->SetType("sum");
   op->SetInput("X", {"test_a", "test_b", "test_c"});
   op->SetOutput("Out", {"test_out"});
+  op->SetAttr("op_role", 1);

   prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS);
   prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS);

diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index b98c29b81d..97b64a6017 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -50,6 +50,7 @@ class Node {
     PADDLE_ENFORCE(type_ == Type::kVariable);
     return var_desc_;
   }
+
   OpDesc* Op() {
     PADDLE_ENFORCE(type_ == Type::kOperation);
     return op_desc_;

From 0b3465d215f9639bffc79f2336d623acca08ba6d Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 20 Jul 2018 10:37:04 +0800
Subject: [PATCH 03/37] better

---
 paddle/fluid/framework/details/CMakeLists.txt |   2 +-
 .../details/multi_devices_graph_builder.cc    |  55 ++++++--
 paddle/fluid/framework/ir/CMakeLists.txt      |   2 +-
 paddle/fluid/framework/ir/graph.cc            | 136 +++++++++++++-----
 paddle/fluid/framework/ir/graph.h             |   3 -
 paddle/fluid/framework/ir/graph_helper.cc     | 116 +++++++++++++++
 paddle/fluid/framework/ir/graph_helper.h      |  35 +++++
 paddle/fluid/framework/ir/node.cc             |   6 +-
 paddle/fluid/framework/ir/node.h              |   2 +
 9 files changed, 308 insertions(+), 49 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/graph_helper.cc
 create mode 100644 paddle/fluid/framework/ir/graph_helper.h

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index df55b3d054..620d202d33 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -5,7 +5,7 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)

-cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph)
+cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph graph_helper)
 cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
 cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index dc9183e96a..4cc6e5727b 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -25,6 +25,7 @@
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"
@@ -186,9 +187,55 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
   return dev_id;
 }

+// Topology sort the graph nodes from inputs to outputs.
+// Since SSAGraphBuilder depends on forward/backward nodes to assign devices
+// to parameter/gradients before optimizer ops, topo sort is insufficient
+// (some optimizer ops might not depend on any nodes), so we manually move all
+// optimizer nodes after the last backward nodes.
+std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const Graph &graph) {
+  std::vector<ir::Node *> ret = ir::TopologySort(graph);
+  size_t last_backward = 0;
+  std::vector<ir::Node *> optimize_ops;
+  std::vector<ir::Node *> sorted_ret;
+  for (size_t i = 0; i < ret.size(); ++i) {
+    if (boost::get<int>(
+            ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kBackward)) {
+      sorted_ret.push_back(ret[i]);
+      last_backward = sorted_ret.size();
+    } else if (boost::get<int>(ret[i]->Op()->GetAttr(
+                   OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+               static_cast<int>(OpRole::kOptimize)) {
+      optimize_ops.push_back(ret[i]);
+    } else {
+      sorted_ret.push_back(ret[i]);
+    }
+  }
+
+  sorted_ret.insert(sorted_ret.begin() + last_backward, optimize_ops.begin(),
+                    optimize_ops.end());
+
+  for (ir::Node *n : sorted_ret) {
+    n->inputs.erase(std::remove_if(n->inputs.begin(), n->inputs.end(),
+                                   [n](ir::Node *t) {
+                                     return t->Name() ==
+                                            ir::Node::kControlDepVarName;
+                                   }),
+                    n->inputs.end());
+    n->outputs.erase(std::remove_if(n->outputs.begin(), n->outputs.end(),
+                                    [n](ir::Node *t) {
+                                      return t->Name() ==
+                                             ir::Node::kControlDepVarName;
+                                    }),
+                     n->outputs.end());
+  }
+  return sorted_ret;
+}
+
 std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
     std::unique_ptr<Graph> graph) const {
   // Rebuild the graph structure.
+  std::vector<ir::Node *> sorted_ops = SortOpsAndDelayOptimizeOp(*graph);
   auto nodes = std::move(graph->nodes);
   graph->nodes.clear();

@@ -217,12 +264,7 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
   size_t cur_device_id = 0;
   bool is_forwarding = true;

-  // NOTE: Currently, passes before SSAGraphBuilder cannot reorder
-  // forward, backward nodes. E.g. you can't append an forward node
-  // at the end of the node list.
-  // TODO(panyx0718): FIXME: Needs to sort by forward->backward order.
-  for (ir::Node *node : TopologySortOperationFromInToOut(nodes)) {
-    VLOG(3) << "apply node: " << node->Name() << reinterpret_cast<void *>(node);
+  for (ir::Node *node : sorted_ops) {
     if (boost::get<int>(
             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
         static_cast<int>(OpRole::kRPC)) {
@@ -240,10 +282,6 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
       // It also assumes backward op will always follow the forward op in
       // the block.
       is_forwarding = false;
-      LOG(ERROR) << "forward flipping!!!!!!!";
     } else {

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index ee0604383e..744696ebb0 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node)
+cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node)
-
 cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry)

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index f297461ab2..46640fedcc 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -22,6 +22,7 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
+/*
 namespace {
 void SortHelper(
     const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
@@ -41,7 +42,21 @@ void SortHelper(
           << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
   ret->push_back(node);
 }
+
+std::vector<ir::Node *> TopologySort(
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) {
+  std::unordered_set<ir::Node *> visited;
+  std::vector<ir::Node *> ret;
+
+  for (auto adj : adj_list) {
+    if (visited.find(adj.first) == visited.end()) {
+      SortHelper(adj_list, adj.first, &visited, &ret);
+    }
+  }
+  return ret;
+}
 }  // namespace
+*/

 Graph::Graph(const ProgramDesc &program) : program_(program) {
   VLOG(3) << "block in program:" << program_.Size();
@@ -48,20 +63,9 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     all_vars.emplace(var->Name(), var);
   }

-  ir::Node *last_backward = nullptr;
-  std::vector<ir::Node *> optimize_ops;
   std::map<std::string, std::vector<ir::Node *>> var_nodes;
   for (auto *op : program.Block(0).AllOps()) {
     ir::Node *node = CreateOpNode(op);
-    if (boost::get<int>(
-            op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
-        static_cast<int>(OpRole::kBackward)) {
-      last_backward = node;
-    } else if (boost::get<int>(
-                   op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
-               static_cast<int>(OpRole::kOptimize)) {
-      optimize_ops.push_back(node);
-    }

     for (auto &each_var_name : op->InputArgumentNames()) {
       ir::Node *var = nullptr;
       if (var_nodes.find(each_var_name) != var_nodes.end()) {
@@ -106,7 +110,8 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
           // Read Write is the same op.
           continue;
         }
-        ir::Node *dep_var = CreateEmptyNode("dummy", ir::Node::Type::kVariable);
+        ir::Node *dep_var = CreateEmptyNode(ir::Node::kControlDepVarName,
+                                            ir::Node::Type::kVariable);
         read_op->outputs.push_back(dep_var);
         dep_var->inputs.push_back(read_op);
         write_op->inputs.push_back(dep_var);
@@ -114,62 +119,121 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     }
   }
+}

-  if (last_backward) {
-    for (ir::Node *opt_node : optimize_ops) {
-      ir::Node *dep_var = CreateEmptyNode("dummy", ir::Node::Type::kVariable);
-      last_backward->outputs.push_back(dep_var);
-      dep_var->inputs.push_back(last_backward);
-      opt_node->inputs.push_back(dep_var);
-      dep_var->outputs.push_back(opt_node);
-      VLOG(3) << "appending connect: " << last_backward->Name()
-              << reinterpret_cast<void *>(last_backward) << "->"
-              << opt_node->Name() << reinterpret_cast<void *>(opt_node);
-    }
-  }
-}
+/*
+bool HasCircleHelper(ir::Node* node,
+                     const std::map<ir::Node *, std::unordered_set<ir::Node *>>
+&adj_list,
+                     std::unordered_set<ir::Node *>* visited,
+                     std::unordered_set<ir::Node *>* in_trace) {
+  if (visited->find(node) == visited->end()) {
+    visited->insert(node);
+    in_trace->insert(node);
+
+    for (ir::Node *in : adj_list.at(node)) {
+      if (visited->find(in) == visited->end() &&
+          HasCircleHelper(in, adj_list, visited, in_trace)) {
+        return true;
+      } else if (in_trace->find(in) != in_trace->end()) {
+        return true;
+      }
+    }
+  }
+  in_trace->erase(node);
+  return false;
+}
+
+bool HasCircle(const std::map<ir::Node *, std::unordered_set<ir::Node *>>
+&adj_list) {
+  std::unordered_set<ir::Node *> visited;
+  std::unordered_set<ir::Node *> in_trace;
+  for (auto& adj : adj_list) {
+    if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
+    const std::vector<ir::Node *> &nodes) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+
+  for (auto &n : nodes) {
+    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    if (adj_list.find(n) == adj_list.end()) {
+      adj_list[n] = std::unordered_set<ir::Node *>();
+    }
+    for (auto &var : n->inputs) {
+      for (auto &adj_n : var->inputs) {
         PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
-        adj_list[n.get()].insert(adj_n);
+        adj_list[n].insert(adj_n);
         LOG(ERROR) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
-                   << " -> " << n->Name() << reinterpret_cast<void *>(n.get())
+                   << " -> " << n->Name() << reinterpret_cast<void *>(n)
                    << " via " << var->Name() << reinterpret_cast<void *>(var);
       }
     }
   }
+  return adj_list;
+}

+std::vector<ir::Node *> TopologySortOperationFromInToOut(
+    const std::vector<std::unique_ptr<ir::Node>> &nodes) {
+  std::vector<ir::Node *> tmp;
+  for (auto& n : nodes) {
+    tmp.push_back(n.get());
+  }
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
+BuildAdjList(tmp);
+
+  PADDLE_ENFORCE(!HasCircle(adj_list));
+  std::vector<ir::Node *> ret = TopologySort(adj_list);
+
+  ir::Node *last_backward = nullptr;
+  std::vector<ir::Node *> optimize_ops;
+  for (ir::Node* n : ret) {
+    if (boost::get<int>(
+            n->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kBackward)) {
+      last_backward = n;
+    } else if (boost::get<int>(
+                   n->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+               static_cast<int>(OpRole::kOptimize)) {
+      optimize_ops.push_back(n);
+    }
+  }
+
+  if (last_backward) {
+    for (ir::Node *opt_node : optimize_ops) {
+      ir::Node *dep_var = CreateEmptyNode(ir::Node::kControlDepVarName,
+                                          ir::Node::Type::kVariable);
+      last_backward->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(last_backward);
+      opt_node->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(opt_node);
+      VLOG(3) << "appending connect: " << last_backward->Name()
+              << reinterpret_cast<void *>(last_backward) << "->"
+              << opt_node->Name() << reinterpret_cast<void *>(opt_node);
+    }
+  }
+
+  PADDLE_ENFORCE(!HasCircle(adj_list));
   for (ir::Node *n : ret) {
     std::unordered_set<ir::Node *> dummy;
     n->inputs.erase(
         std::remove_if(n->inputs.begin(), n->inputs.end(),
-                       [n](ir::Node *t) { return t->Name() == "dummy"; }),
+                       [n](ir::Node *t) {
+                         return t->Name() == ir::Node::kControlDepVarName; }),
         n->inputs.end());
     n->outputs.erase(
         std::remove_if(n->outputs.begin(), n->outputs.end(),
-                       [n](ir::Node *t) { return t->Name() == "dummy"; }),
+                       [n](ir::Node *t) {
+                         return t->Name() == ir::Node::kControlDepVarName; }),
         n->outputs.end());
   }
   return ret;
-}
+}*/

 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 0242edecf4..b4ac135b02 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -78,8 +78,5 @@ class Graph {
   std::map<std::string, std::function<void(void)>> attr_dels_;
 };

-std::vector<ir::Node *> TopologySortOperationFromInToOut(
-    const std::vector<std::unique_ptr<ir::Node>>& nodes);
-
 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
new file mode 100644
index 0000000000..ecd90f4f3e
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -0,0 +1,116 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <algorithm>
#include <unordered_set>

#include "paddle/fluid/framework/ir/graph_helper.h"

namespace paddle {
namespace framework {
namespace ir {
namespace {
void SortHelper(
    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
    ir::Node *node, std::unordered_set<ir::Node *> *visited,
    std::vector<ir::Node *> *ret) {
  visited->insert(node);

  for (auto adj : adj_list.at(node)) {
    if (visited->find(adj) == visited->end()) {
      SortHelper(adj_list, adj, visited, ret);
    }
  }

  LOG(ERROR) << "topology sort insert: " << node->Name()
             << reinterpret_cast<void *>(node) << " input "
             << node->inputs.size();
  ret->push_back(node);
}

bool HasCircleHelper(
    ir::Node *node,
    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
    std::unordered_set<ir::Node *> *visited,
    std::unordered_set<ir::Node *> *in_trace) {
  if (visited->find(node) == visited->end()) {
    visited->insert(node);
    in_trace->insert(node);

    for (ir::Node *in : adj_list.at(node)) {
      if (visited->find(in) == visited->end() &&
          HasCircleHelper(in, adj_list, visited, in_trace)) {
        return true;
      } else if (in_trace->find(in) != in_trace->end()) {
        return true;
      }
    }
  }
  in_trace->erase(node);
  return false;
}
}  // namespace

bool HasCircle(const Graph &graph) {
  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
      BuildAdjList(graph);

  std::unordered_set<ir::Node *> visited;
  std::unordered_set<ir::Node *> in_trace;
  for (auto &adj : adj_list) {
    if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) {
      return true;
    }
  }
  return false;
}

std::vector<ir::Node *> TopologySort(const Graph &graph) {
  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
      BuildAdjList(graph);
  std::unordered_set<ir::Node *> visited;
  std::vector<ir::Node *> ret;
  for (auto adj : adj_list) {
    if (visited.find(adj.first) == visited.end()) {
      SortHelper(adj_list, adj.first, &visited, &ret);
    }
  }
  return ret;
}

std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
    const Graph &graph) {
  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;

  for (auto &n : graph.nodes) {
    if (n->NodeType() != ir::Node::Type::kOperation) continue;
    if (adj_list.find(n.get()) == adj_list.end()) {
      adj_list[n.get()] = std::unordered_set<ir::Node *>();
    }
    for (auto &var : n->inputs) {
      for (auto &adj_n : var->inputs) {
        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
        adj_list[n.get()].insert(adj_n);
        LOG(ERROR) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                   << " -> " << n->Name() << reinterpret_cast<void *>(n.get())
                   << " via " << var->Name() << reinterpret_cast<void *>(var);
      }
    }
  }
  return adj_list;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
new file mode 100644
index 0000000000..b8714eb5be
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -0,0 +1,35 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <map>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"

namespace paddle {
namespace framework {
namespace ir {
bool HasCircle(const Graph &graph);

std::vector<ir::Node *> TopologySort(const Graph &graph);

std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
    const Graph &graph);
}  // namespace ir
}  // namespace framework
}  // namespace paddle

diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index 86376e7e8b..aca77da8d6 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -15,5 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/node.h"

 namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
+namespace ir {
+const char Node::kControlDepVarName[] = "__control_var";
+}  // namespace ir
+}  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 97b64a6017..b3138fccee 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -27,6 +27,8 @@ namespace ir {
 class Node {
  public:
   enum class Type { kOperation, kVariable };
+  static const char kControlDepVarName[];
+
   explicit Node(const std::string& name, Type type)
       : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}

From c3f6e0e8a2f2185559cff1a0e79c50431d41ae2f Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 20 Jul 2018 12:20:03 +0800
Subject: [PATCH 04/37] add namespace to Graph

---
 .../details/multi_devices_graph_builder.cc    | 35 ++++++----
 .../details/multi_devices_graph_builder.h     | 27 +++++++-------
 .../framework/details/ssa_graph_builder.cc    |  8 ++---
 .../framework/details/ssa_graph_builder.h     |  8 ++---
 .../framework/details/ssa_graph_checker.cc    |  2 +-
 .../framework/details/ssa_graph_checker.h     |  5 +--
 .../framework/details/ssa_graph_printer.cc    |  4 +--
 .../framework/details/ssa_graph_printer.h     |  7 ++--
 .../details/threaded_ssa_graph_executor.cc    |  3 +-
 .../details/threaded_ssa_graph_executor.h     |  4 +--
 paddle/fluid/framework/ir/graph.cc            | 12 ++++---
 paddle/fluid/framework/ir/graph.h             | 22 ++++++------
 paddle/fluid/framework/ir/graph_helper.cc     |  8 ++---
 paddle/fluid/framework/ir/graph_helper.h      |  4 +--
 paddle/fluid/framework/ir/graph_test.cc       |  2 +-
 paddle/fluid/framework/parallel_executor.cc   |  2 +-
 16 files changed, 82 insertions(+), 71 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 4cc6e5727b..4050424e70 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -68,7 +68,8 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   }
 }

-void MultiDevSSAGraphBuilder::CreateOpHandleIOs(Graph *result, ir::Node *node,
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result,
+                                                ir::Node *node,
                                                 size_t place_id) const {
   auto p = places_[place_id];
   auto *op_handle = result->Get<GraphOps>("ops").back().get();
@@ -192,8 +193,9 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
 // Topology sort the graph nodes from inputs to outputs.
 // Since SSAGraphBuilder depends on forward/backward nodes to assign devices
 // to parameter/gradients before optimizer ops, topo sort is insufficient
 // (some optimizer ops might not depend on any nodes), so we manually move all
 // optimizer nodes after the last backward nodes.
-std::vector SortOpsAndDelayOptimizeOp(const Graph &graph) { - std::vector ret = ir::TopologySort(graph); +// However, the assumption by SSAGraphBuilder should be relaxed in the future. +std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { + std::vector ret = ir::TopologySortOperations(graph); size_t last_backward = 0; std::vector optimize_ops; std::vector sorted_ret; @@ -232,8 +234,8 @@ std::vector SortOpsAndDelayOptimizeOp(const Graph &graph) { return sorted_ret; } -std::unique_ptr MultiDevSSAGraphBuilder::Apply( - std::unique_ptr graph) const { +std::unique_ptr MultiDevSSAGraphBuilder::Apply( + std::unique_ptr graph) const { // Rebuild the graph structure. std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); auto nodes = std::move(graph->nodes); @@ -245,7 +247,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( } } - Graph &result = *graph; + ir::Graph &result = *graph; std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 @@ -397,7 +399,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( #endif } -void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result, +void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { #ifdef PADDLE_WITH_CUDA @@ -427,7 +429,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result, } } -void MultiDevSSAGraphBuilder::CreateComputationalOp(Graph *result, +void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const { result->Get("ops").emplace_back( @@ -436,7 +438,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(Graph *result, CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result, +void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, const std::string &og) const { #ifdef PADDLE_WITH_CUDA result->Get("ops").emplace_back(new AllReduceOpHandle( @@ -466,7 +468,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result, } void MultiDevSSAGraphBuilder::InsertDataBalanceOp( - Graph *result, const std::vector &datas) const { + ir::Graph *result, const std::vector &datas) const { #ifdef PADDLE_WITH_CUDA result->Get("ops").emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), @@ -529,7 +531,7 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { return got == var_name_on_devices_.end() ? 
-1 : got->second; } -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const { +void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle #ifdef PADDLE_WITH_CUDA @@ -559,7 +561,7 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const { } } -void MultiDevSSAGraphBuilder::CreateComputationalOps(Graph *result, +void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const { for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { @@ -571,7 +573,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(Graph *result, } } -VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result, +VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const { #ifdef PADDLE_WITH_CUDA @@ -604,7 +606,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result, // Find the first occurence of `prev_op_name` and make current `op` depend // on it. -void MultiDevSSAGraphBuilder::ConnectOp(Graph *result, OpHandleBase *op, +void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const { for (auto &prev_op : result->Get("ops")) { if (prev_op->Name() == prev_op_name) { @@ -617,7 +619,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(Graph *result, OpHandleBase *op, } } -void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result, +void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; std::vector input_var_names; @@ -664,7 +666,8 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result, } // Create RPC related op handles that connects its in ops and out ops. -void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const { +void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, + ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 2b7f4f586b..c2c764bb94 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -46,11 +46,13 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &local_scopes, const BuildStrategy &strategy); #endif - std::unique_ptr Apply(std::unique_ptr graph) const override; + std::unique_ptr Apply( + std::unique_ptr graph) const override; int GetVarDeviceID(const std::string &varname) const override; private: - void CreateOpHandleIOs(Graph *result, ir::Node *node, size_t device_id) const; + void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, + size_t device_id) const; private: std::string loss_var_name_; @@ -64,8 +66,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { bool IsScaleLossOp(ir::Node *node) const; - void CreateRPCOp(Graph *result, ir::Node *node) const; - void CreateDistTrainOp(Graph *result, ir::Node *node) const; + void CreateRPCOp(ir::Graph *result, ir::Node *node) const; + void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; /** * Is this operator as the end-point operator before/after send operator. 
@@ -79,16 +81,17 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
   std::vector<std::string> FindDistTrainRecvVars(
       const std::vector<std::unique_ptr<ir::Node>> &nodes) const;

-  void ConnectOp(Graph *result, OpHandleBase *op,
+  void ConnectOp(ir::Graph *result, OpHandleBase *op,
                  const std::string &prev_op_name) const;

-  void CreateComputationalOps(Graph *result, ir::Node *node,
+  void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                               size_t num_places) const;

-  void CreateScaleLossGradOp(Graph *result) const;
-  VarHandle *CreateReduceOp(Graph *result, const std::string &og,
+  void CreateScaleLossGradOp(ir::Graph *result) const;
+  VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                             int dst_dev_id) const;

-  void CreateComputationalOp(Graph *result, ir::Node *node, int dev_id) const;
+  void CreateComputationalOp(ir::Graph *result, ir::Node *node,
+                             int dev_id) const;

   bool IsParameterGradientOnce(
       const std::string &og,
@@ -96,12 +99,12 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {

   int GetOpDeviceID(ir::Node *node) const;

-  void InsertAllReduceOp(Graph *result, const std::string &og) const;
+  void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;

-  void InsertDataBalanceOp(Graph *result,
+  void InsertDataBalanceOp(ir::Graph *result,
                            const std::vector<std::string> &datas) const;

-  void CreateBroadcastOp(Graph *result, const std::string &p_name,
+  void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
                          size_t src_dev_id) const;

   bool IsSparseGradient(const std::string &og) const;

diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 2be4bb009e..3c579f427e 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -17,7 +17,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) {
+void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
   for (auto &var_map : graph->Get<GraphVars>("vars")) {
     for (auto &name_pair : var_map) {
       if (name_pair.second.size() <= 1) {
@@ -60,7 +60,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
 }

 VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
-    Graph *graph, ir::Node *node, const platform::Place &place,
+    ir::Graph *graph, ir::Node *node, const platform::Place &place,
     size_t place_offset) {
   auto &var_holders = graph->Get<GraphVars>("vars")[place_offset];
   auto &var_holder = var_holders[node->Name()];
@@ -81,7 +81,7 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
   return var;
 }

-void SSAGraphBuilder::CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
+void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
                                      ir::Node *new_node,
                                      const platform::Place &place,
                                      size_t place_offset) {
@@ -93,7 +93,7 @@ void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
   op_handle->AddOutput(var);
 }

-void SSAGraphBuilder::AddOutputToLeafOps(Graph *graph) {
+void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) {
   for (auto &op : graph->Get<GraphOps>("ops")) {
     if (!op->Outputs().empty()) {
       continue;

diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index e8e8acdb38..f64445b470 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -64,19 +64,19 @@ class SSAGraphBuilder : public ir::Pass {
    *
    * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
    */
-  static void PolishGraphToSupportDataHazards(Graph *graph);
+  static void PolishGraphToSupportDataHazards(ir::Graph *graph);

-  static VarHandle *CreateOrGetLatestVarHandle(Graph *graph, ir::Node *node,
+  static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
                                                const platform::Place &place,
                                                size_t place_offset);

   // Add an output variable (each_var_name, place, place_offset) to op_handle,
   // which belongs to graph
-  static void CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
+  static void CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
                              ir::Node *new_node, const platform::Place &place,
                              size_t place_offset);

-  static void AddOutputToLeafOps(Graph *graph);
+  static void AddOutputToLeafOps(ir::Graph *graph);
 };
 }  // namespace details
 }  // namespace framework

diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc
index 7c79d7f1e8..0438b09610 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.cc
+++ b/paddle/fluid/framework/details/ssa_graph_checker.cc
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
+bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   std::unordered_set<VarHandleBase *> ready_vars;

diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
index 2e397e8682..51ce6e5eca 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -28,7 +28,8 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
       std::unique_ptr<SSAGraphBuilder>&& builder)
       : builder_(std::move(builder)) {}

-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
+  std::unique_ptr<ir::Graph> Apply(
+      std::unique_ptr<ir::Graph> graph) const override {
     auto new_graph = builder_->Apply(std::move(graph));
     PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
     return new_graph;
@@ -38,7 +39,7 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
     return builder_->GetVarDeviceID(var_name);
   }

-  bool IsValidGraph(const Graph* graph) const;
+  bool IsValidGraph(const ir::Graph* graph) const;

  private:
   std::unique_ptr<SSAGraphBuilder> builder_;

diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc
index 6dd6fd262e..20aab14644 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -21,7 +21,7 @@ namespace framework {
 namespace details {

 template <typename Callback>
-static inline void IterAllVar(const Graph &graph, Callback callback) {
+static inline void IterAllVar(const ir::Graph &graph, Callback callback) {
   for (auto &each : graph.Get<GraphVars>("vars")) {
     for (auto &pair1 : each) {
       for (auto &pair2 : pair1.second) {
@@ -35,7 +35,7 @@ static inline void IterAllVar(const ir::Graph &graph, Callback callback) {
   }
 }

-void GraphvizSSAGraphPrinter::Print(const Graph &graph,
+void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
                                     std::ostream &sout) const {
   size_t var_id = 0;
   std::unordered_map<const VarHandleBase *, size_t> vars;

diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
index cd72162f44..a77c1bad3f 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -25,12 +25,12 @@ namespace details {
 class SSAGraphPrinter {
  public:
   virtual ~SSAGraphPrinter() {}
-  virtual void Print(const Graph& graph, std::ostream& sout) const = 0;
+  virtual void Print(const ir::Graph& graph, std::ostream& sout) const = 0;
 };

 class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
  public:
-  void Print(const Graph& graph, std::ostream& sout) const override;
+  void Print(const ir::Graph& graph, std::ostream& sout) const override;
 };

 class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
@@ -50,7 +50,8 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
         stream_ptr_(std::move(sout)),
         stream_ref_(*stream_ptr_) {}

-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
+  std::unique_ptr<ir::Graph> Apply(
+      std::unique_ptr<ir::Graph> graph) const override {
     auto new_graph = builder_->Apply(std::move(graph));
     printer_->Print(*new_graph, stream_ref_);
     return new_graph;

diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index f85c62dd6c..c19f74476f 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -21,7 +21,8 @@ namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places, std::unique_ptr<Graph> &&graph)
+    const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> &&graph)
     : graph_(std::move(graph)),
       pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                        : nullptr),

diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index bf7c0a367a..3d67daa45e 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -40,7 +40,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::unique_ptr<Graph> &&graph);
+                           std::unique_ptr<ir::Graph> &&graph);

   // Run a SSAGraph by a thread pool
   // Use topological sort algorithm
@@ -53,7 +53,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
                     details::OpHandleBase *op);

  private:
-  std::unique_ptr<Graph> graph_;
+  std::unique_ptr<ir::Graph> graph_;
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 46640fedcc..18211f2e2b 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -22,6 +22,7 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
+namespace ir {
 /*
 namespace {
 void SortHelper(
@@ -41,7 +42,7 @@ void SortHelper(
   ret->push_back(node);
 }

-std::vector<ir::Node *> TopologySort(
+std::vector<ir::Node *> TopologySortOperations(
     const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) {
   std::unordered_set<ir::Node *> visited;
   std::vector<ir::Node *> ret;
@@ -156,7 +157,7 @@ bool HasCircle(const std::map<ir::Node *, std::unordered_set<ir::Node *>>
   return false;
 }

-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     const std::vector<ir::Node *> &nodes) {
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;

@@ -178,17 +179,17 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
   return adj_list;
 }

-std::vector<ir::Node *> TopologySortOperationFromInToOut(
+std::vector<ir::Node *> TopologySortOperationsOperationFromInToOut(
     const std::vector<std::unique_ptr<ir::Node>> &nodes) {
   std::vector<ir::Node *> tmp;
   for (auto& n : nodes) {
     tmp.push_back(n.get());
   }
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
-BuildAdjList(tmp);
+BuildOperationAdjList(tmp);

   PADDLE_ENFORCE(!HasCircle(adj_list));
-  std::vector<ir::Node *> ret = TopologySort(adj_list);
+  std::vector<ir::Node *> ret = TopologySortOperations(adj_list);

   ir::Node *last_backward = nullptr;
   std::vector<ir::Node *> optimize_ops;
@@ -235,5 +236,6 @@ BuildOperationAdjList(tmp);
   return ret;
 }*/

+}  // namespace ir
 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index b4ac135b02..a1e39b08a4 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -26,13 +26,13 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
-
+namespace ir {
 class Graph {
  public:
-  explicit Graph(const ProgramDesc& program);
+  explicit Graph(const ProgramDesc &program);

   virtual ~Graph() {
-    for (auto& attr : attrs_) {
+    for (auto &attr : attrs_) {
       attr_dels_[attr.first]();
     }
     attrs_.clear();
@@ -40,12 +40,12 @@ class Graph {
   }

   template <typename AttrType>
-  AttrType& Get(const std::string& attr_name) const {
-    return *boost::any_cast<AttrType*>(attrs_.at(attr_name));
+  AttrType &Get(const std::string &attr_name) const {
+    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
   }

   template <typename AttrType>
-  void Set(const std::string& attr_name, AttrType* attr) {
+  void Set(const std::string &attr_name, AttrType *attr) {
     PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = [attr, attr_name]() {
@@ -54,17 +54,17 @@ class Graph {
     };
   }

-  ir::Node* CreateVarNode(VarDesc* var_desc) {
+  ir::Node *CreateVarNode(VarDesc *var_desc) {
     nodes.emplace_back(new ir::Node(var_desc));
     return nodes.back().get();
   }

-  ir::Node* CreateOpNode(OpDesc* op_desc) {
+  ir::Node *CreateOpNode(OpDesc *op_desc) {
     nodes.emplace_back(new ir::Node(op_desc));
     return nodes.back().get();
   }

-  ir::Node* CreateEmptyNode(const std::string& name, ir::Node::Type type) {
+  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) {
     nodes.emplace_back(new ir::Node(name, type));
     return nodes.back().get();
   }

@@ -73,10 +73,10 @@ class Graph {

  private:
   // NOTE: program_ shouldn't be exposed to user.
-  const ProgramDesc& program_;
+  const ProgramDesc &program_;
   std::map<std::string, boost::any> attrs_;
   std::map<std::string, std::function<void(void)>> attr_dels_;
 };
-
+}  // namespace ir
 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index ecd90f4f3e..76458be135 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -64,7 +64,7 @@ bool HasCircleHelper(

 bool HasCircle(const Graph &graph) {
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
-      BuildAdjList(graph);
+      BuildOperationAdjList(graph);

   std::unordered_set<ir::Node *> visited;
   std::unordered_set<ir::Node *> in_trace;
@@ -76,9 +76,9 @@ bool HasCircle(const Graph &graph) {
   return false;
 }

-std::vector<ir::Node *> TopologySort(const Graph &graph) {
+std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
-      BuildAdjList(graph);
+      BuildOperationAdjList(graph);
   std::unordered_set<ir::Node *> visited;
   std::vector<ir::Node *> ret;
   for (auto adj : adj_list) {
@@ -89,7 +89,7 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
   return ret;
 }

-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     const Graph &graph) {
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;

diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index b8714eb5be..55b2e3f5ca 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -26,9 +26,9 @@ namespace framework {
 namespace ir {
 bool HasCircle(const Graph &graph);

-std::vector<ir::Node *> TopologySort(const Graph &graph);
+std::vector<ir::Node *> TopologySortOperations(const Graph &graph);

-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildAdjList(
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     const Graph &graph);
 }  // namespace ir
 }  // namespace framework

diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index 186047b370..f8fbd2242d 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -93,7 +93,7 @@ TEST(GraphTest, Basic) {
   ASSERT_EQ(proto::VarType::LOD_TENSOR,
             prog.MutableBlock(0)->Var("test_out")->GetType());

-  std::unique_ptr<Graph> g(new Graph(prog));
+  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
   ASSERT_EQ(g->nodes[0]->Name(), "sum");
   ASSERT_EQ(g->nodes[0]->inputs[0]->Name(), "test_a");
   ASSERT_EQ(g->nodes[0]->inputs[1]->Name(), "test_b");

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 1e5bba62b5..02c836bea1 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -132,7 +132,7 @@ ParallelExecutor::ParallelExecutor(
 #endif
   }
   builder_ = builder_factory.Create();
-  std::unique_ptr<Graph> graph(new Graph(main_program));
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
   graph = builder_->Apply(std::move(graph));
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, places, std::move(graph)));

From f6d99d1f73f1b57ee94a0db7fb6c039ff72085de Mon Sep 17 00:00:00 2001
From: Xin Pan
Date: Fri, 20 Jul 2018 15:15:21 +0800
Subject: [PATCH 05/37] polish

---
 .../details/multi_devices_graph_builder.cc |  21 ---
 .../framework/details/ssa_graph_builder.cc |  42 -----
 .../framework/details/ssa_graph_builder.h  |   9 -
 paddle/fluid/framework/ir/graph.cc         | 155 +----------------
 4 files changed, 7 insertions(+), 220 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 4050424e70..f5e99c5748 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -216,21 +216,6 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {

   sorted_ret.insert(sorted_ret.begin() + last_backward, optimize_ops.begin(),
                     optimize_ops.end());
-
-  for (ir::Node *n : sorted_ret) {
-    n->inputs.erase(std::remove_if(n->inputs.begin(), n->inputs.end(),
-                                   [n](ir::Node *t) {
-                                     return t->Name() ==
-                                            ir::Node::kControlDepVarName;
-                                   }),
-                    n->inputs.end());
-    n->outputs.erase(std::remove_if(n->outputs.begin(), n->outputs.end(),
-                                    [n](ir::Node *t) {
-                                      return t->Name() ==
-                                             ir::Node::kControlDepVarName;
-                                    }),
-                     n->outputs.end());
-  }
   return sorted_ret;
 }

@@ -365,12 +350,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::Apply(
     }
   }

-  /*
-  Dependency graph has been constructed. However, there are still data
-  hazards need to be handled.
-  */
-  PolishGraphToSupportDataHazards(&result);
-
   /*
    * Only variables should be the leaves of graph.
    */

diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 3c579f427e..dcdcb28ac4 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -17,48 +17,6 @@
 namespace paddle {
 namespace framework {
 namespace details {
-void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
-  for (auto &var_map : graph->Get<GraphVars>("vars")) {
-    for (auto &name_pair : var_map) {
-      if (name_pair.second.size() <= 1) {
-        continue;
-      }
-      auto it_new = name_pair.second.rbegin();
-      auto it_old = name_pair.second.rbegin();
-      ++it_old;
-      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
-        OpHandleBase *write_op = (*it_new)->GeneratedOp();
-        const auto &read_ops = (*it_old)->PendingOps();
-
-        for (auto *read_op : read_ops) {
-          // Manually add a dependency var from read_op to write_op;
-          if (read_op == write_op) {
-            // Read Write is the same op.
-            continue;
-          }
-
-          bool has_dep = false;
-          for (auto read_out : read_op->Outputs()) {
-            for (auto write_in : write_op->Inputs()) {
-              if (read_out == write_in) {
-                has_dep = true;
-                break;
-              }
-            }
-          }
-          if (has_dep) continue;
-
-          auto *dep_var = new DummyVarHandle(
-              graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
-          read_op->AddOutput(dep_var);
-          write_op->AddInput(dep_var);
-          graph->Get<GraphDepVars>("dep_vars").emplace(dep_var);
-        }
-      }
-    }
-  }
-}
-
 VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
     ir::Graph *graph, ir::Node *node, const platform::Place &place,
     size_t place_offset) {

diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index f64445b470..e99bab518e 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -57,15 +57,6 @@ class SSAGraphBuilder : public ir::Pass {
   DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);

  protected:
-  /**
-   * We only handle write after read(WAR), since it should not have a write
-   * after write in program. If there are write after write operators, we need
-   * prune them.
-   *
-   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
-   */
-  static void PolishGraphToSupportDataHazards(ir::Graph *graph);
-
   static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
                                                const platform::Place &place,
                                                size_t place_offset);

diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 18211f2e2b..769dddbc59 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -23,39 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
-/*
-namespace {
-void SortHelper(
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
-    ir::Node *node, std::unordered_set<ir::Node *> *visited,
-    std::vector<ir::Node *> *ret) {
-  visited->insert(node);
-
-  for (auto adj : adj_list.at(node)) {
-    if (visited->find(adj) == visited->end()) {
-      SortHelper(adj_list, adj, visited, ret);
-    }
-  }
-
-  VLOG(3) << "topology sort insert: " << node->Name()
-          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
-  ret->push_back(node);
-}
-
-std::vector<ir::Node *> TopologySortOperations(
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) {
-  std::unordered_set<ir::Node *> visited;
-  std::vector<ir::Node *> ret;
-
-  for (auto adj : adj_list) {
-    if (visited.find(adj.first) == visited.end()) {
-      SortHelper(adj_list, adj.first, &visited, &ret);
-    }
-  }
-  return ret;
-}
-}  // namespace
-*/

 Graph::Graph(const ProgramDesc &program) : program_(program) {
   VLOG(3) << "block in program:" << program_.Size();
@@ -93,6 +60,13 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
       var->inputs.push_back(node);
     }
   }
+  /**
+   * We only handle write after read(WAR), since it should not have a write
+   * after write in program. If there are write after write operators, we need
+   * prune them.
+   *
+   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
+   */
   for (auto &var : var_nodes) {
     auto &versions = var.second;
     if (versions.size() <= 1) continue;
@@ -121,121 +95,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     }
   }
 }
-
-/*
-bool HasCircleHelper(ir::Node* node,
-                     const std::map<ir::Node *, std::unordered_set<ir::Node *>>
-&adj_list,
-                     std::unordered_set<ir::Node *>* visited,
-                     std::unordered_set<ir::Node *>* in_trace) {
-  if (visited->find(node) == visited->end()) {
-    visited->insert(node);
-    in_trace->insert(node);
-
-    for (ir::Node *in : adj_list.at(node)) {
-      if (visited->find(in) == visited->end() &&
-          HasCircleHelper(in, adj_list, visited, in_trace)) {
-        return true;
-      } else if (in_trace->find(in) != in_trace->end()) {
-        return true;
-      }
-    }
-  }
-  in_trace->erase(node);
-  return false;
-}
-
-bool HasCircle(const std::map<ir::Node *, std::unordered_set<ir::Node *>>
-&adj_list) {
-  std::unordered_set<ir::Node *> visited;
-  std::unordered_set<ir::Node *> in_trace;
-  for (auto& adj : adj_list) {
-    if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
-    const std::vector<ir::Node *> &nodes) {
-  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
-
-  for (auto &n : nodes) {
-    if (n->NodeType() != ir::Node::Type::kOperation) continue;
-    if (adj_list.find(n) == adj_list.end()) {
-      adj_list[n] = std::unordered_set<ir::Node *>();
-    }
-    for (auto &var : n->inputs) {
-      for (auto &adj_n : var->inputs) {
-        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
-        adj_list[n].insert(adj_n);
-      }
-    }
-  }
-  return adj_list;
-}
-
-std::vector<ir::Node *> TopologySortOperationsOperationFromInToOut(
-    const std::vector<std::unique_ptr<ir::Node>> &nodes) {
-  std::vector<ir::Node *> tmp;
-  for
(auto& n : nodes) { - tmp.push_back(n.get()); - } - std::map> adj_list = -BuildOperationAdjList(tmp); - - PADDLE_ENFORCE(!HasCircle(adj_list)); - std::vector ret = TopologySortOperations(adj_list); - - ir::Node *last_backward = nullptr; - std::vector optimize_ops; - for (ir::Node* n : ret) { - if (boost::get( - n->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kBackward)) { - last_backward = n; - } else if (boost::get( - n->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kOptimize)) { - optimize_ops.push_back(n); - } - } - - if (last_backward) { - for (ir::Node *opt_node : optimize_ops) { - ir::Node *dep_var = CreateEmptyNode(ir::Node::kControlDepVarName, - ir::Node::Type::kVariable); - last_backward->outputs.push_back(dep_var); - dep_var->inputs.push_back(last_backward); - opt_node->inputs.push_back(dep_var); - dep_var->outputs.push_back(opt_node); - VLOG(3) << "appending connect: " << last_backward->Name() - << reinterpret_cast(last_backward) << "->" - << opt_node->Name() << reinterpret_cast(opt_node); - } - } - - PADDLE_ENFORCE(!HasCircle(adj_list)); - for (ir::Node *n : ret) { - std::unordered_set dummy; - n->inputs.erase( - std::remove_if(n->inputs.begin(), n->inputs.end(), - [n](ir::Node *t) { - return t->Name() == ir::Node::kControlDepVarName; }), - n->inputs.end()); - n->outputs.erase( - std::remove_if(n->outputs.begin(), n->outputs.end(), - [n](ir::Node *t) { - return t->Name() == ir::Node::kControlDepVarName; }), - n->outputs.end()); - } - return ret; -}*/ - } // namespace ir } // namespace framework } // namespace paddle From 93355cc0d222b558689f963805d21b366ee8699d Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sat, 21 Jul 2018 17:27:25 +0800 Subject: [PATCH 06/37] fix control deps --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../details/multi_devices_graph_builder.cc | 34 ++++++++------ .../details/multi_devices_graph_builder.h | 4 +- .../fluid/framework/details/rpc_op_handle.cc | 3 +- .../framework/details/ssa_graph_builder.cc | 33 +++++++++++++- .../framework/details/ssa_graph_builder.h | 9 ++++ paddle/fluid/framework/details/var_handle.cc | 2 +- paddle/fluid/framework/ir/graph.cc | 26 ++++++++--- paddle/fluid/framework/ir/graph.h | 45 ++++++++++++++++--- paddle/fluid/framework/ir/graph_helper.cc | 19 ++++---- paddle/fluid/framework/ir/graph_helper.h | 1 + paddle/fluid/framework/ir/graph_test.cc | 29 ++++++------ paddle/fluid/operators/CMakeLists.txt | 4 +- paddle/fluid/operators/send_recv_util.h | 5 ++- 14 files changed, 155 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 620d202d33..9df7df1f42 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto) +cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node) cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index f5e99c5748..150f1534c8 100644 --- 
a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -94,12 +94,11 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, } std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const std::vector> &nodes) const { + const std::vector &nodes) const { std::vector send_vars; // since parameters are all in block 0, // it's enough to only scan send ops in block 0 for (auto &node : nodes) { - if (node->NodeType() != ir::Node::Type::kOperation) continue; OpDesc *op = node->Op(); // TODO(Yancey1989): use a graceful method to find send op, // instead of the the hard code string @@ -114,10 +113,9 @@ std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( } std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const std::vector> &nodes) const { + const std::vector &nodes) const { std::vector recv_vars; for (auto &node : nodes) { - if (node->NodeType() != ir::Node::Type::kOperation) continue; OpDesc *op = node->Op(); // TODO(Yancey1989): use a graceful method to find recv op, // instead of the hard code string @@ -214,6 +212,19 @@ std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { } } + // Verify that no operations before optimize ops depends on optimize ops. + std::unordered_set optimize_set(optimize_ops.begin(), + optimize_ops.end()); + for (size_t i = 0; i < last_backward; ++i) { + for (ir::Node *in : sorted_ret[i]->inputs) { + for (ir::Node *pre_n : in->inputs) { + PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), + "optimize operations cannot be depended by forward " + "or backward node %s -> %s", + pre_n->Name(), sorted_ret[i]->Name()); + } + } + } sorted_ret.insert(sorted_ret.begin() + last_backward, optimize_ops.begin(), optimize_ops.end()); return sorted_ret; @@ -221,18 +232,16 @@ std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { std::unique_ptr MultiDevSSAGraphBuilder::Apply( std::unique_ptr graph) const { - // Rebuild the graph structure. + // Give the topology sort order and rebuild the graph structure. std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); - auto nodes = std::move(graph->nodes); - graph->nodes.clear(); + auto nodes = graph->ReleaseNodes(); + ir::Graph &result = *graph; for (auto &node : nodes) { if (node->NodeType() == ir::Node::Type::kVariable) { all_vars_.emplace(node->Name(), node->Var()); } } - - ir::Graph &result = *graph; std::unordered_set og_has_been_broadcast; // We cannot invoke resize. 
It is a bug of GCC 4.8 @@ -242,8 +251,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // find send/recv vars so that we can place the distributed training // realted op in the place 0 - auto send_vars = FindDistTrainSendVars(nodes); - auto recv_vars = FindDistTrainRecvVars(nodes); + auto send_vars = FindDistTrainSendVars(sorted_ops); + auto recv_vars = FindDistTrainRecvVars(sorted_ops); std::vector> bcast_var_name_set; bcast_var_name_set.resize(places_.size()); @@ -589,8 +598,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const { for (auto &prev_op : result->Get("ops")) { if (prev_op->Name() == prev_op_name) { - auto *dep_var = new DummyVarHandle( - result->CreateEmptyNode("dummy", ir::Node::Type::kVariable)); + auto *dep_var = new DummyVarHandle(result->CreateControlDepVar()); prev_op->AddOutput(dep_var); result->Get("dep_vars").emplace(dep_var); op->AddInput(dep_var); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index c2c764bb94..55076f227b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -76,10 +76,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &recv_vars) const; std::vector FindDistTrainSendVars( - const std::vector> &nodes) const; + const std::vector &nodes) const; std::vector FindDistTrainRecvVars( - const std::vector> &nodes) const; + const std::vector &nodes) const; void ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const; diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 924ff4d118..c52a07530e 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -33,7 +33,8 @@ void RPCOpHandle::RunImpl() { for (auto *in : inputs_) { auto &p = static_cast(in)->place_; // FIXME(Yancey1989): need a better solution instead of use DebugString() - if (in->DebugString() == "dummy") { // HACK + if (in->Node()->Name().find(ir::Node::kControlDepVarName) != + std::string::npos) { // HACK continue; } if (in->GeneratedOp()) { diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index dcdcb28ac4..e7db8729cf 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -17,6 +17,36 @@ namespace paddle { namespace framework { namespace details { +void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { + for (auto &var_map : graph->Get("vars")) { + for (auto &name_pair : var_map) { + if (name_pair.second.size() <= 1) { + continue; + } + auto it_new = name_pair.second.rbegin(); + auto it_old = name_pair.second.rbegin(); + ++it_old; + for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { + OpHandleBase *write_op = (*it_new)->GeneratedOp(); + const auto &read_ops = (*it_old)->PendingOps(); + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. 
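+ // (a node that both reads and writes the variable imposes no extra ordering)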
+ continue; + } + + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + read_op->AddOutput(dep_var); + write_op->AddInput(dep_var); + graph->Get("dep_vars").emplace(dep_var); + } + } + } + } +} + VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset) { @@ -56,8 +86,7 @@ void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) { if (!op->Outputs().empty()) { continue; } - auto *dummy_leaf = new DummyVarHandle( - graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable)); + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); graph->Get("dep_vars").emplace(dummy_leaf); op->AddOutput(dummy_leaf); } diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index e99bab518e..f64445b470 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -57,6 +57,15 @@ class SSAGraphBuilder : public ir::Pass { DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: + /** + * We only handle write after read(WAR), since it should not have a write + * after write in program. If there are write after write operators, we need + * prune them. + * + * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) + */ + static void PolishGraphToSupportDataHazards(ir::Graph *graph); + static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset); diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 6f00abd947..5457870e9f 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -26,7 +26,7 @@ std::string VarHandle::DebugString() const { return ss.str(); } -std::string DummyVarHandle::DebugString() const { return "dummy"; } +std::string DummyVarHandle::DebugString() const { return node_->Name(); } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 769dddbc59..6ad8773567 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -34,7 +34,8 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { std::map> var_nodes; for (auto *op : program.Block(0).AllOps()) { ir::Node *node = CreateOpNode(op); - + // For input args, reuse the same var name if it was created before. + // Otherwise, create a new one. for (auto &each_var_name : op->InputArgumentNames()) { ir::Node *var = nullptr; if (var_nodes.find(each_var_name) != var_nodes.end()) { @@ -43,16 +44,16 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { var = CreateVarNode(all_vars.at(each_var_name)); var_nodes[each_var_name].push_back(var); } else { - // TODO(paddle-dev): Seems some assumption doesn't hold? - VLOG(3) << op->Type() - << " input var not in all_var list: " << each_var_name; + // Operation input var can be optional (dispensable). Which means + // the operation doesn't really need the var at runtime. In this + // case, the no-existed var is ready at the beginning. var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable); var_nodes[each_var_name].push_back(var); } node->inputs.push_back(var); var->outputs.push_back(node); } - + // For output args, always create a new var. 
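+ // Each write appends a fresh version node to var_nodes[name], keeping the
+ // versions in program order for the WAR handling below.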
for (auto &each_var_name : op->OutputArgumentNames()) { ir::Node *var = CreateVarNode(all_vars.at(each_var_name)); var_nodes[each_var_name].push_back(var); @@ -67,6 +68,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { * * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) */ + for (auto &var : var_nodes) { auto &versions = var.second; if (versions.size() <= 1) continue; @@ -85,8 +87,18 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { // Read Write is the same op. continue; } - ir::Node *dep_var = CreateEmptyNode(ir::Node::kControlDepVarName, - ir::Node::Type::kVariable); + // 2 ops might have been connected via other vars. + bool has_dep = false; + for (ir::Node *r_out : read_op->outputs) { + for (ir::Node *w_in : write_op->inputs) { + if (r_out == w_in) { + has_dep = true; + break; + } + } + } + if (has_dep) continue; + ir::Node *dep_var = CreateControlDepVar(); read_op->outputs.push_back(dep_var); dep_var->inputs.push_back(read_op); write_op->inputs.push_back(dep_var); diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index a1e39b08a4..fccad9fd4f 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -27,6 +27,7 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { + class Graph { public: explicit Graph(const ProgramDesc &program); @@ -54,28 +55,58 @@ class Graph { }; } + const std::unordered_set &Nodes() const { return node_set_; } + ir::Node *CreateVarNode(VarDesc *var_desc) { - nodes.emplace_back(new ir::Node(var_desc)); - return nodes.back().get(); + return AddNode(new ir::Node(var_desc)); } ir::Node *CreateOpNode(OpDesc *op_desc) { - nodes.emplace_back(new ir::Node(op_desc)); - return nodes.back().get(); + return AddNode(new ir::Node(op_desc)); + } + + ir::Node *CreateControlDepVar() { + // TODO(panyx0718): control var name should be unique. + const std::string name = string::Sprintf( + "%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); + return AddNode(new ir::Node(name, ir::Node::Type::kVariable)); } ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) { - nodes.emplace_back(new ir::Node(name, type)); - return nodes.back().get(); + return AddNode(new ir::Node(name, type)); } - std::vector> nodes; + std::vector> ReleaseNodes() { + std::vector> ret; + for (auto &n : nodes_) { + ret.emplace_back(n.second.release()); + } + nodes_.clear(); + node_set_.clear(); + return ret; + } private: + // This method takes ownership of `node`. + ir::Node *AddNode(ir::Node *node) { + PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); + nodes_[node].reset(node); + node_set_.insert(node); + return node; + } + + void RemoveNode(ir::Node *node) { + PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); + node_set_.erase(node); + nodes_.erase(node); + } + // NOTE: program_ shouldn't be exposed to user. 
const ProgramDesc &program_; std::map attrs_; std::map> attr_dels_; + std::map> nodes_; + std::unordered_set node_set_; }; } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 76458be135..b829cf204d 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -33,9 +33,8 @@ void SortHelper( } } - LOG(ERROR) << "topology sort insert: " << node->Name() - << reinterpret_cast(node) << " input " - << node->inputs.size(); + VLOG(3) << "topology sort insert: " << node->Name() + << reinterpret_cast(node) << " input " << node->inputs.size(); ret->push_back(node); } @@ -93,18 +92,18 @@ std::map> BuildOperationAdjList( const Graph &graph) { std::map> adj_list; - for (auto &n : graph.nodes) { + for (auto &n : graph.Nodes()) { if (n->NodeType() != ir::Node::Type::kOperation) continue; - if (adj_list.find(n.get()) == adj_list.end()) { - adj_list[n.get()] = std::unordered_set(); + if (adj_list.find(n) == adj_list.end()) { + adj_list[n] = std::unordered_set(); } for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); - adj_list[n.get()].insert(adj_n); - LOG(ERROR) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) - << " -> " << n->Name() << reinterpret_cast(n.get()) - << " via " << var->Name() << reinterpret_cast(var); + adj_list[n].insert(adj_n); + VLOG(3) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + << " -> " << n->Name() << reinterpret_cast(n) + << " via " << var->Name() << reinterpret_cast(var); } } } diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 55b2e3f5ca..118c16ab7a 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -30,6 +30,7 @@ std::vector TopologySortOperations(const Graph &graph); std::map> BuildOperationAdjList( const Graph &graph); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index f8fbd2242d..feb4e8e76b 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -94,20 +94,21 @@ TEST(GraphTest, Basic) { prog.MutableBlock(0)->Var("test_out")->GetType()); std::unique_ptr g(new ir::Graph(prog)); - ASSERT_EQ(g->nodes[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[0]->inputs[0]->Name(), "test_a"); - ASSERT_EQ(g->nodes[0]->inputs[1]->Name(), "test_b"); - ASSERT_EQ(g->nodes[0]->inputs[2]->Name(), "test_c"); - ASSERT_EQ(g->nodes[0]->outputs[0]->Name(), "test_out"); - ASSERT_EQ(g->nodes[1]->Name(), "test_a"); - ASSERT_EQ(g->nodes[1]->outputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[2]->Name(), "test_b"); - ASSERT_EQ(g->nodes[2]->outputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[3]->Name(), "test_c"); - ASSERT_EQ(g->nodes[3]->outputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes[4]->Name(), "test_out"); - ASSERT_EQ(g->nodes[4]->inputs[0]->Name(), "sum"); - ASSERT_EQ(g->nodes.size(), 5); + std::vector nodes(g->Nodes().begin(), g->Nodes().end()); + ASSERT_EQ(nodes[0]->Name(), "sum"); + ASSERT_EQ(nodes[0]->inputs[0]->Name(), "test_a"); + ASSERT_EQ(nodes[0]->inputs[1]->Name(), "test_b"); + ASSERT_EQ(nodes[0]->inputs[2]->Name(), "test_c"); + ASSERT_EQ(nodes[0]->outputs[0]->Name(), "test_out"); + ASSERT_EQ(nodes[1]->Name(), "test_a"); + ASSERT_EQ(nodes[1]->outputs[0]->Name(), "sum"); + ASSERT_EQ(nodes[2]->Name(), "test_b"); + 
ASSERT_EQ(nodes[2]->outputs[0]->Name(), "sum"); + ASSERT_EQ(nodes[3]->Name(), "test_c"); + ASSERT_EQ(nodes[3]->outputs[0]->Name(), "sum"); + ASSERT_EQ(nodes[4]->Name(), "test_out"); + ASSERT_EQ(nodes[4]->inputs[0]->Name(), "sum"); + ASSERT_EQ(nodes.size(), 5); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4e2002ad24..9b56ad4c55 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -192,9 +192,9 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h index deab005149..500230d553 100644 --- a/paddle/fluid/operators/send_recv_util.h +++ b/paddle/fluid/operators/send_recv_util.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/framework/ir/node.h" namespace paddle { namespace operators { @@ -22,7 +23,9 @@ inline bool NeedSend(const framework::Scope& scope, const std::string& varname) { // dummy variable is only used in parallel executor to represent // some dependency relationship, we don't need to send/recv it. - if (varname == "dummy") return false; + if (varname.find(framework::ir::Node::kControlDepVarName) != + std::string::npos) + return false; auto* var = scope.FindVar(varname); PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", varname); From 2782e71a11082edf83906efaecfc5c0a47731a72 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 22 Jul 2018 12:43:27 +0800 Subject: [PATCH 07/37] fix graph_test --- paddle/fluid/framework/ir/graph_test.cc | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index feb4e8e76b..73ef55756c 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -95,19 +95,19 @@ TEST(GraphTest, Basic) { std::unique_ptr g(new ir::Graph(prog)); std::vector nodes(g->Nodes().begin(), g->Nodes().end()); - ASSERT_EQ(nodes[0]->Name(), "sum"); - ASSERT_EQ(nodes[0]->inputs[0]->Name(), "test_a"); - ASSERT_EQ(nodes[0]->inputs[1]->Name(), "test_b"); - ASSERT_EQ(nodes[0]->inputs[2]->Name(), "test_c"); - ASSERT_EQ(nodes[0]->outputs[0]->Name(), "test_out"); - ASSERT_EQ(nodes[1]->Name(), "test_a"); - ASSERT_EQ(nodes[1]->outputs[0]->Name(), "sum"); - ASSERT_EQ(nodes[2]->Name(), "test_b"); - ASSERT_EQ(nodes[2]->outputs[0]->Name(), "sum"); - ASSERT_EQ(nodes[3]->Name(), "test_c"); - ASSERT_EQ(nodes[3]->outputs[0]->Name(), "sum"); - ASSERT_EQ(nodes[4]->Name(), "test_out"); - ASSERT_EQ(nodes[4]->inputs[0]->Name(), "sum"); + for (ir::Node *n : nodes) { + if (n->Name() == "sum") { + ASSERT_EQ(n->inputs.size(), 3); + ASSERT_EQ(n->outputs.size(), 1); + } else if (n->Name() == "test_a" || n->Name() == "test_b" || + n->Name() == "test_c") { + ASSERT_EQ(n->inputs.size(), 0); + ASSERT_EQ(n->outputs.size(), 1); + } else if 
(n->Name() == "test_out") { + ASSERT_EQ(n->inputs.size(), 1); + ASSERT_EQ(n->outputs.size(), 0); + } + } ASSERT_EQ(nodes.size(), 5); } } // namespace framework From 21a45420f0dc4da9413b002d380e8b31bff88e05 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 22 Jul 2018 21:10:46 +0800 Subject: [PATCH 08/37] polish and test --- .../details/multi_devices_graph_builder.cc | 8 ++ .../fluid/framework/details/rpc_op_handle.cc | 4 +- .../framework/details/ssa_graph_builder.cc | 30 ----- .../framework/details/ssa_graph_builder.h | 9 -- paddle/fluid/framework/ir/CMakeLists.txt | 3 +- paddle/fluid/framework/ir/graph.cc | 4 + paddle/fluid/framework/ir/graph.h | 13 +- paddle/fluid/framework/ir/graph_helper.cc | 13 +- paddle/fluid/framework/ir/graph_helper.h | 4 + .../fluid/framework/ir/graph_helper_test.cc | 125 ++++++++++++++++++ paddle/fluid/operators/send_recv_util.h | 1 + 11 files changed, 166 insertions(+), 48 deletions(-) create mode 100644 paddle/fluid/framework/ir/graph_helper_test.cc diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 150f1534c8..de0cac0f1a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -170,6 +170,7 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( const std::vector &var_names) const { int64_t numel_sum = 0; for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; auto var_desc = all_vars_.at(var_name); PADDLE_ENFORCE_NOT_NULL(var_desc); auto dim = framework::make_ddim(var_desc->GetShape()); @@ -271,6 +272,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // user can customize loss@grad if not use_default_grad_scale_ if (strategy_.gradient_scale_ != BuildStrategy::GradientScaleStrategy::kCustomized) { + // TODO(paddle-dev): Why is there no input for this op_handle? CreateScaleLossGradOp(&result); } // This assumes the backward generating code will ensure IsScaleLossOp @@ -288,6 +290,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( } else { // This op runs on all devices, and its output may have parameter's // gradients. + // TODO(paddle-dev): Why is so special about "read" op? if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) { node->Op()->SetAttr("throw_eof_exp", false); CreateComputationalOps(&result, node, places_.size()); @@ -363,6 +366,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( * Only variables should be the leaves of graph. */ AddOutputToLeafOps(&result); + PADDLE_ENFORCE(!ir::HasCircle(result)); return graph; } @@ -620,6 +624,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, if (node->Op()->Type() == "split_byref" || node->Op()->Type() == "split_selected_rows") { + // TODO(paddle-dev): getting the first var is not safe. op_dev_id = GetVarDeviceID(input_var_names[0]); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); @@ -657,7 +662,10 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { + // TODO(paddle-dev): getting the first var is not safe. 
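+ // inputs[0] is assumed to be the variable being sent; the ENFORCE below
+ // at least rejects control-dependency vars sneaking into that slot.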
op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); + PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), + "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by // split_byref op // so that we can balance the variable blocks to all the pserver diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index c52a07530e..f44b374edb 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/rpc_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { @@ -33,8 +34,7 @@ void RPCOpHandle::RunImpl() { for (auto *in : inputs_) { auto &p = static_cast(in)->place_; // FIXME(Yancey1989): need a better solution instead of use DebugString() - if (in->Node()->Name().find(ir::Node::kControlDepVarName) != - std::string::npos) { // HACK + if (ir::IsControlDepVar(*in->Node())) { // HACK continue; } if (in->GeneratedOp()) { diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index e7db8729cf..203d5fbbc1 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -17,36 +17,6 @@ namespace paddle { namespace framework { namespace details { -void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { - for (auto &var_map : graph->Get("vars")) { - for (auto &name_pair : var_map) { - if (name_pair.second.size() <= 1) { - continue; - } - auto it_new = name_pair.second.rbegin(); - auto it_old = name_pair.second.rbegin(); - ++it_old; - for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { - OpHandleBase *write_op = (*it_new)->GeneratedOp(); - const auto &read_ops = (*it_old)->PendingOps(); - - for (auto *read_op : read_ops) { - // Manually add a dependency var from read_op to write_op; - if (read_op == write_op) { - // Read Write is the same op. - continue; - } - - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - read_op->AddOutput(dep_var); - write_op->AddInput(dep_var); - graph->Get("dep_vars").emplace(dep_var); - } - } - } - } -} - VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset) { diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index f64445b470..e99bab518e 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -57,15 +57,6 @@ class SSAGraphBuilder : public ir::Pass { DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: - /** - * We only handle write after read(WAR), since it should not have a write - * after write in program. If there are write after write operators, we need - * prune them. 
- * - * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) - */ - static void PolishGraphToSupportDataHazards(ir::Graph *graph); - static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 744696ebb0..6447452ae5 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -2,4 +2,5 @@ cc_library(node SRCS node.cc DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node) -cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry) +cc_test(graph_test SRCS graph_test.cc DEPS graph op_registry) +cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph_helper op_registry) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 6ad8773567..c5b4851477 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -107,6 +107,10 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { } } } + +bool IsControlDepVar(const ir::Node &var) { + return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos; +} } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index fccad9fd4f..4f59ec82a7 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -57,25 +57,34 @@ class Graph { const std::unordered_set &Nodes() const { return node_set_; } + // Create a normal variable with non-null VarDesc. ir::Node *CreateVarNode(VarDesc *var_desc) { return AddNode(new ir::Node(var_desc)); } + // Create a normal runnable operator with OpDesc. ir::Node *CreateOpNode(OpDesc *op_desc) { return AddNode(new ir::Node(op_desc)); } + // Create a control dependency var that connects 2 operations. The + // var doesn't hold any data. Other than that, it's no different from + // other var, considering dependency analysis. ir::Node *CreateControlDepVar() { - // TODO(panyx0718): control var name should be unique. + // TODO(panyx0718): control var name should be really unique. const std::string name = string::Sprintf( "%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); return AddNode(new ir::Node(name, ir::Node::Type::kVariable)); } + // A more free style way of creating a graph node. Mostly use for test + // or "copy" from another node. Avoid using it if possible. ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) { return AddNode(new ir::Node(name, type)); } + // Clear all node information of the graph and return the ownership of the + // nodes. 
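+ // Callers keep the returned vector alive for as long as raw Node
+ // pointers obtained from this graph are still in use.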
std::vector> ReleaseNodes() { std::vector> ret; for (auto &n : nodes_) { @@ -108,6 +117,8 @@ class Graph { std::map> nodes_; std::unordered_set node_set_; }; + +bool IsControlDepVar(const ir::Node &var); } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index b829cf204d..b1c19e6535 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -59,12 +59,9 @@ bool HasCircleHelper( in_trace->erase(node); return false; } -} // namespace - -bool HasCircle(const Graph &graph) { - std::map> adj_list = - BuildOperationAdjList(graph); +bool HasCircleInternal( + const std::map> &adj_list) { std::unordered_set visited; std::unordered_set in_trace; for (auto &adj : adj_list) { @@ -74,10 +71,16 @@ bool HasCircle(const Graph &graph) { } return false; } +} // namespace + +bool HasCircle(const Graph &graph) { + return HasCircleInternal(BuildOperationAdjList(graph)); +} std::vector TopologySortOperations(const Graph &graph) { std::map> adj_list = BuildOperationAdjList(graph); + PADDLE_ENFORCE(!HasCircleInternal(adj_list)); std::unordered_set visited; std::vector ret; for (auto adj : adj_list) { diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 118c16ab7a..cd6c53a07f 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -24,10 +24,14 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { +// Test if the graph contains circle. bool HasCircle(const Graph &graph); +// Topology Sort the operations in the graph from inputs to outputs. +// `graph` cannot contain circle. std::vector TopologySortOperations(const Graph &graph); +// Build an adjacency list of operations for the `graph`. std::map> BuildOperationAdjList( const Graph &graph); diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc new file mode 100644 index 0000000000..b517442bb7 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -0,0 +1,125 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/graph.h" +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +void BuildCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o1->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o1); +} + +void BuildCircleGraph2(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + + o2->outputs.push_back(v2); + o1->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o1); +} + +void BuildNoCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(GraphHelperTest, Basic) { + ProgramDesc prog; + + Graph g(prog); + BuildCircleGraph(&g); + ASSERT_TRUE(HasCircle(g)); + + Graph g2(prog); + BuildCircleGraph2(&g2); + ASSERT_TRUE(HasCircle(g2)); + + auto adj_list = BuildOperationAdjList(g2); + for (auto& adj : adj_list) { + auto& adj_set = adj.second; + if (adj.first->Name() == "op1") { + ASSERT_EQ((*adj_set.begin())->Name(), "op2"); + } else if (adj.first->Name() == "op2") { + ASSERT_EQ((*adj_set.begin())->Name(), "op1"); + } else { + ASSERT_TRUE(false); + } + } + + Graph g3(prog); + BuildNoCircleGraph(&g3); + ASSERT_FALSE(HasCircle(g3)); + auto sorted = TopologySortOperations(g3); + std::map node_map; + for (size_t i = 0; i < sorted.size(); ++i) { + node_map[sorted[i]->Name()] = i; + } + ASSERT_EQ(node_map.at("op1"), 0); + ASSERT_EQ(node_map.at("op2"), 1); + ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h index 500230d553..dc26c53c64 100644 --- a/paddle/fluid/operators/send_recv_util.h +++ b/paddle/fluid/operators/send_recv_util.h @@ -23,6 +23,7 @@ inline bool 
NeedSend(const framework::Scope& scope, const std::string& varname) { // dummy variable is only used in parallel executor to represent // some dependency relationship, we don't need to send/recv it. + // TODO(paddle-dev): Why would parallel executor logic leak into here? if (varname.find(framework::ir::Node::kControlDepVarName) != std::string::npos) return false; auto* var = scope.FindVar(varname); PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", varname); From 5173a53c8a6ac2e398c4624aff76c125716f775d Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 23 Jul 2018 13:03:00 +0800 Subject: [PATCH 09/37] fix reorder issue. --- doc/fluid/design/ir/draft.md | 48 +++++++-------- .../details/multi_devices_graph_builder.cc | 59 +++++++++++-------- .../framework/details/ssa_graph_builder.cc | 40 +++++++++++++ .../framework/details/ssa_graph_builder.h | 6 ++ paddle/fluid/framework/ir/graph.cc | 1 + 5 files changed, 107 insertions(+), 47 deletions(-) diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md index a141dcbca5..a33b5a9c93 100644 --- a/doc/fluid/design/ir/draft.md +++ b/doc/fluid/design/ir/draft.md @@ -1,16 +1,16 @@ ## Motivation -There is a ```gap``` between the ```Program``` defined by -user and the ```Executable``` that can be scheduled +There is a `gap` between the `Program` defined by +user and the `Executable` that can be scheduled efficiently on heterogeneous hardware, either locally or in a distributed setting. -Usually, the ```gap``` is bridged by +Usually, the `gap` is bridged by * A series of transformations with a defined order. * These transformations usually involve -```insert, delete, clustering, split, dependency analysis```. +`insert, delete, clustering, split, dependency analysis`. * A simple way to verify and debug each transformation. @@ -38,44 +38,44 @@ design below. #### Node -```Node``` represents an operation that performs some computation or +`Node` represents an operation that performs some computation or a variable that is an input or output of an operation. -```Node```s are connected to other ```Node```s via inputs and outputs. +`Node`s are connected to other `Node`s via inputs and outputs. Other properties (maybe device placement information) can be added -to ```Node``` in the future if it's a -common requirement of many other ```Pass```es. Otherwise, it should live -in a ```Node``` wrapper class that is private to some ```Pass``` or be -a local member of a ```Pass```. +to `Node` in the future if it's a +common requirement of many other `Pass`es. Otherwise, it should live +in a `Node` wrapper class that is private to some `Pass` or be +a local member of a `Pass`. #### Graph -```Graph``` contains a list of ```Node```s, which are connected to +`Graph` contains a list of `Node`s, which are connected to each other via inputs and outputs. TODO: Better definitions for the graph. -```Graph``` can also contain ```Attribute```s. ```Attribute```s -can be ``any`` thing. For example, it can be a list of "wraper" -nodes. The ```wrapper``` nodes compose ```Node```s and provide -helper method for execution or transformation. ```Attribute``` +`Graph` can also contain `Attribute`s. `Attribute`s +can be `any` thing. For example, it can be a list of "wrapper" +nodes. The `wrapper` nodes compose `Node`s and provide +helper methods for execution or transformation. `Attribute` can also contain other things that describe some properties of -the ```Graph``` or ```Graph``` nodes. ```Attribute``` can be passed -across ```Pass```. However, it should be used with care. +the `Graph` or `Graph` nodes. `Attribute` can be passed +across `Pass`. However, it should be used with care.
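To make `Attribute` more concrete, here is a minimal sketch of how one `Pass` could attach an attribute that a later `Pass` reads back; the attribute name `"stats"` and its payload type are illustrative only, not part of the design:

```c++
// Attach an attribute; the Graph takes ownership of the raw pointer and
// frees it through the registered deleter when the Graph is destroyed.
graph->Set("stats", new std::map<std::string, int>());

// A later Pass retrieves the same attribute by name and static type.
auto &stats = graph->Get<std::map<std::string, int>>("stats");
stats["fused_node_count"] = 0;  // illustrative payload
```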
#### Pass -```Pass``` represents a transformation of ```Graph```. Its input -is a ```Graph``` and its output is also a ```Graph```. For example, -a ```Pass``` can simply print out the ```Graph```. A ```Pass``` -can also fuse some ```Graph```'s ```Node```s. +`Pass` represents a transformation of `Graph`. Its input +is a `Graph` and its output is also a `Graph`. For example, +a `Pass` can simply print out the `Graph`. A `Pass` +can also fuse some `Graph`'s `Node`s. #### Optimize -```Optimize``` contains a series of ```Pass``` with defined order. -```Optimize``` transforms a ```Graph``` that only contains raw -modeling logic to a ```Graph``` that can be run efficiently while +`Optimize` contains a series of `Pass` with defined order. +`Optimize` transforms a `Graph` that only contains raw +modeling logic to a `Graph` that can be run efficiently while maintaining the original modeling logic. diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index de0cac0f1a..22f0cb20d0 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -196,38 +196,46 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { std::vector ret = ir::TopologySortOperations(graph); size_t last_backward = 0; - std::vector optimize_ops; - std::vector sorted_ret; for (size_t i = 0; i < ret.size(); ++i) { if (boost::get( ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kBackward)) { - sorted_ret.push_back(ret[i]); - last_backward = sorted_ret.size(); - } else if (boost::get(ret[i]->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kOptimize)) { - optimize_ops.push_back(ret[i]); - } else { - sorted_ret.push_back(ret[i]); + last_backward = i; } } - // Verify that no operations before optimize ops depends on optimize ops. - std::unordered_set optimize_set(optimize_ops.begin(), - optimize_ops.end()); - for (size_t i = 0; i < last_backward; ++i) { - for (ir::Node *in : sorted_ret[i]->inputs) { - for (ir::Node *pre_n : in->inputs) { - PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), - "optimize operations cannot be depended by forward " - "or backward node %s -> %s", - pre_n->Name(), sorted_ret[i]->Name()); + std::vector optimize_ops; + std::vector sorted_ret; + for (size_t i = 0; i < ret.size(); ++i) { + if (i < last_backward) { + if (boost::get(ret[i]->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(OpRole::kOptimize)) { + optimize_ops.push_back(ret[i]); + } else { + sorted_ret.push_back(ret[i]); + } + } else if (i == last_backward) { + sorted_ret.push_back(ret[i]); + // Verify that no operations before optimize ops depends on optimize ops. 
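+ // i.e. no input of an op kept before the barrier may be produced by one
+ // of the delayed optimize ops.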
+ std::unordered_set optimize_set(optimize_ops.begin(), + optimize_ops.end()); + for (ir::Node *n : sorted_ret) { + for (ir::Node *in : n->inputs) { + for (ir::Node *pre_n : in->inputs) { + PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), + "optimize operations cannot be depended by forward " + "or backward node %s -> %s", + pre_n->Name(), n->Name()); + } + } } + sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), + optimize_ops.end()); + } else { + sorted_ret.push_back(ret[i]); } } - sorted_ret.insert(sorted_ret.begin() + last_backward, optimize_ops.begin(), - optimize_ops.end()); return sorted_ret; } @@ -239,7 +247,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( ir::Graph &result = *graph; for (auto &node : nodes) { - if (node->NodeType() == ir::Node::Type::kVariable) { + if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); } } @@ -361,6 +369,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( } } } + /* + Dependency graph has been constructed. However, there are still data + hazards need to be handled. + */ + PolishGraphToSupportDataHazards(&result); /* * Only variables should be the leaves of graph. diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 203d5fbbc1..506e7eb35c 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -17,6 +17,46 @@ namespace paddle { namespace framework { namespace details { +void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { + for (auto &var_map : graph->Get("vars")) { + for (auto &name_pair : var_map) { + if (name_pair.second.size() <= 1) { + continue; + } + auto it_new = name_pair.second.rbegin(); + auto it_old = name_pair.second.rbegin(); + ++it_old; + for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { + OpHandleBase *write_op = (*it_new)->GeneratedOp(); + const auto &read_ops = (*it_old)->PendingOps(); + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. + continue; + } + bool has_dep = false; + for (auto *r_out : read_op->Outputs()) { + for (auto *w_in : write_op->Inputs()) { + if (r_out->Node() == w_in->Node()) { + has_dep = true; + break; + } + } + } + if (has_dep) continue; + + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + read_op->AddOutput(dep_var); + write_op->AddInput(dep_var); + graph->Get("dep_vars").emplace(dep_var); + } + } + } + } +} + VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset) { diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index e99bab518e..2b4f31f2ff 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -57,6 +57,12 @@ class SSAGraphBuilder : public ir::Pass { DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: + /* + Dependency graph has been constructed. However, there are still data + hazards need to be handled. 
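+ PolishGraphToSupportDataHazards adds the missing write-after-read
+ dependency edges.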
+ */ + static void PolishGraphToSupportDataHazards(ir::Graph *graph); + static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset); diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index c5b4851477..740acfafb7 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -98,6 +98,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { } } if (has_dep) continue; + ir::Node *dep_var = CreateControlDepVar(); read_op->outputs.push_back(dep_var); dep_var->inputs.push_back(read_op); From 8a521c0b4da5118098d57e34c1c4150e276f140a Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 16 Jul 2018 17:44:14 +0800 Subject: [PATCH 10/37] Remove buggy get_test_program and refine c++ reader demo --- python/paddle/fluid/io.py | 98 --------------- .../convert_data_to_recordio.py | 8 +- .../tests/demo/text_classification/train.py | 115 +++++++++--------- 3 files changed, 62 insertions(+), 159 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 0eb1194e27..32368d3c0c 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -789,101 +789,3 @@ def get_parameter_value_by_name(name, executor, program=None): program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) - - -def get_test_program(filelist, program=None, startup_program=None): - """ - Transpile current train program to a program to read test dataset - if the program is using reader ops like "open_files_op". - """ - - def _copy_reader_var_(block, var, new_name=None): - if new_name == None: - new_name = var.name - new_var = block.create_var( - name=str(new_name), type=core.VarDesc.VarType.READER) - new_var.desc.set_shapes(var.desc.shapes()) - new_var.desc.set_dtypes(var.desc.dtypes()) - new_var.persistable = True - return new_var - - def _get_test_reader_name(train_reader_name): - return train_reader_name + "_test" - - def _is_reader_op(op): - block = op.block - if "Out" in op.output_names: - reader_out = block.vars[op.output("Out")[0]] - if reader_out.type == core.VarDesc.VarType.READER: - return True - return False - - if program == None: - program = default_main_program() - if startup_program == None: - startup_program = default_startup_program() - startup_block = startup_program.global_block() - - # 1. find out the orignal reader var name - startup_reader_op_list = [] - - for op in startup_block.ops: - if _is_reader_op(op): - startup_reader_op_list.append(op) - - if len(startup_reader_op_list) == 0: - return program - - root_reader_op = startup_reader_op_list[0] - train_test_reader_map = {} - # 2. 
add operators to startup to read open and read test data files - for op in startup_reader_op_list: - assert (len(op.output("Out")) == 1) - train_reader_name = op.output("Out")[0] - train_reader = startup_block.vars[train_reader_name] - test_reader = _copy_reader_var_( - startup_block, - train_reader, - new_name=_get_test_reader_name(train_reader_name)) - train_test_reader_map[train_reader.name] = test_reader - - test_op_inputs = {} - for name in op.input_names: - train_arg_names = op.input(name) - test_arg_vars = [] - for arg_name in train_arg_names: - arg_var = train_test_reader_map[ - arg_name] if name == "UnderlyingReader" else startup_block.vars[ - arg_name] - test_arg_vars.append(arg_var) - test_op_inputs[name] = test_arg_vars - - test_op = startup_block.append_op( - type=op.type, - inputs=test_op_inputs, - outputs={'Out': [test_reader]}, - attrs=op.attrs) - # root reader op's filelist attr for read test files - if op.type == root_reader_op.type: - test_op.set_attr("file_names", filelist) - if op.type == "create_multi_pass_reader": - test_op.set_attr("pass_num", 1) - - # 3. rename reader vars in inference program to different name - # to avoid read from train data. - main_block = program.global_block() - for var in main_block.vars.values(): - if var.type == core.VarDesc.VarType.READER: - main_block.rename_var( - str(var.name), str(_get_test_reader_name(var.name))) - - for op in main_block.ops: - if op.type == root_reader_op.type: - test_op.set_attr("file_names", filelist) - if op.type == "create_multi_pass_reader": - test_op.set_attr("pass_num", 1) - - startup_program.sync_with_cpp() - program.sync_with_cpp() - - return program diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py index 9425d472a4..2dd8f352f7 100644 --- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py +++ b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py @@ -31,8 +31,12 @@ def load_vocab(filename): # load word dict with paddle inner function -word_dict = load_vocab(sys.argv[1]) -word_dict[""] = len(word_dict) +if len(sys.argv) > 1: + word_dict = load_vocab(sys.argv[1]) + word_dict[""] = len(word_dict) +else: + word_dict = paddle.dataset.imdb.word_dict() + print "Dict dim = ", len(word_dict) # input text data diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/text_classification/train.py index e408684c6e..9e930b67a4 100644 --- a/python/paddle/fluid/tests/demo/text_classification/train.py +++ b/python/paddle/fluid/tests/demo/text_classification/train.py @@ -19,7 +19,7 @@ import sys TRAIN_FILES = ['train.recordio'] TEST_FILES = ['test.recordio'] -DICT_DIM = 89528 +DICT_DIM = 5147 # embedding dim emb_dim = 128 @@ -33,33 +33,24 @@ hid_dim2 = 96 # class num class_dim = 2 +# epoch num +epoch_num = 10 -def network_cfg(is_train, pass_num=100): - with fluid.unique_name.guard(): - train_file_obj = fluid.layers.open_files( - filenames=TRAIN_FILES, - pass_num=pass_num, - shapes=[[-1, 1], [-1, 1]], - lod_levels=[1, 0], - dtypes=['int64', 'int64'], - thread_num=1) - - test_file_obj = fluid.layers.open_files( - filenames=TEST_FILES, - pass_num=1, - shapes=[[-1, 1], [-1, 1]], - lod_levels=[1, 0], - dtypes=['int64', 'int64'], - thread_num=1) - if is_train: - file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000) - else: - file_obj = test_file_obj +def build_program(is_train): + 
file_obj_handle = fluid.layers.io.open_files( + filenames=TRAIN_FILES if is_train else TEST_FILES, + shapes=[[-1, 1], [-1, 1]], + lod_levels=[1, 0], + dtypes=['int64', 'int64'], + thread_num=1) + if is_train: + file_obj = fluid.layers.io.shuffle(file_obj_handle, buffer_size=1000) + else: + file_obj = file_obj_handle + file_obj = fluid.layers.io.double_buffer(file_obj) - file_obj = fluid.layers.double_buffer( - file_obj, - name="train_double_buffer" if is_train else 'test_double_buffer') + with fluid.unique_name.guard(): data, label = fluid.layers.read_file(file_obj) @@ -90,58 +81,64 @@ def network_cfg(is_train, pass_num=100): if is_train: # SGD optimizer - sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) - return { - 'loss': avg_cost, - 'log': [avg_cost, acc], - 'file': train_file_obj if is_train else test_file_obj - } + return {'loss': avg_cost, 'log': [avg_cost, acc], 'file': file_obj_handle} def main(): train = fluid.Program() startup = fluid.Program() + test = fluid.Program() with fluid.program_guard(train, startup): - train_args = network_cfg(is_train=True) - - test = fluid.Program() + train_args = build_program(is_train=True) - with fluid.program_guard(test, fluid.Program()): - test_args = network_cfg(is_train=False) + with fluid.program_guard(test, startup): + test_args = build_program(is_train=False) + use_cuda = fluid.core.is_compiled_with_cuda() # startup - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place=place) exe.run(startup) train_exe = fluid.ParallelExecutor( - use_cuda=True, loss_name=train_args['loss'].name, main_program=train) + use_cuda=use_cuda, + loss_name=train_args['loss'].name, + main_program=train) + test_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, main_program=test, share_vars_from=train_exe) fetch_var_list = [var.name for var in train_args['log']] - for i in xrange(sys.maxint): - result = map(numpy.array, - train_exe.run(fetch_list=fetch_var_list - if i % 1000 == 0 else [])) - if len(result) != 0: - print 'Train: ', result - - if i % 1000 == 0: - test_exe = fluid.ParallelExecutor( - use_cuda=True, main_program=test, share_vars_from=train_exe) - loss = [] - acc = [] - try: - while True: - loss_np, acc_np = map( - numpy.array, test_exe.run(fetch_list=fetch_var_list)) - loss.append(loss_np[0]) - acc.append(acc_np[0]) - except: - test_args['file'].reset() - print 'TEST: ', numpy.mean(loss), numpy.mean(acc) + for epoch_id in range(epoch_num): + # train + try: + batch_id = 0 + while True: + result = map(numpy.array, + train_exe.run(fetch_list=fetch_var_list + if batch_id % 10 == 0 else [])) + if len(result) != 0: + print 'Train loss: ', result + batch_id += 1 + except fluid.core.EOFException: + print 'End of epoch', epoch_id + train_args['file'].reset() + + # test + loss = [] + acc = [] + try: + while True: + loss_np, acc_np = map(numpy.array, + test_exe.run(fetch_list=fetch_var_list)) + loss.append(loss_np[0]) + acc.append(acc_np[0]) + except: + test_args['file'].reset() + print 'TEST: ', numpy.mean(loss), numpy.mean(acc) if __name__ == '__main__': From aae994fd26f5d09488d5993efa29349a056ca83c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 24 Jul 2018 14:49:45 +0800 Subject: [PATCH 11/37] refine im2col no padding --- paddle/fluid/operators/math/im2col.cc | 34 +++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git 
a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index a50b9ace39..1a42ef1264 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -40,22 +40,46 @@ class Im2ColFunctordims()[1]; int filter_width = col->dims()[2]; - int col_height = col->dims()[3]; - int col_width = col->dims()[4]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; int channels_col = im_channels * filter_height * filter_width; const T* im_data = im.data(); T* col_data = col->data(); + // TODO(TJ): change me to template + // further optimaze: + // 1. padding != 1 + // 2. could also support stride_h != 1 + if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && + dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) { + int col_matrix_width = output_width * output_height; + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + oh * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_height * im_width; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } + return; + } + for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; int c_im = c / (filter_width * filter_height); - for (int h = 0; h < col_height; ++h) { + for (int h = 0; h < output_height; ++h) { int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < col_width; ++w) { + for (int w = 0; w < output_width; ++w) { int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * col_height + h) * col_width + w; + int col_idx = (c * output_height + h) * output_width + w; int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || From 5022b14de85ee6787629839506919363a3d85b84 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Tue, 24 Jul 2018 16:18:22 +0800 Subject: [PATCH 12/37] fix mixed tensor compile and add cpu unit test --- paddle/fluid/framework/CMakeLists.txt | 7 ++- paddle/fluid/framework/mixed_vector.h | 10 ++-- ...ed_vector_test.cu => mixed_vector_test.cc} | 60 ++++++++++++------- 3 files changed, 51 insertions(+), 26 deletions(-) rename paddle/fluid/framework/{mixed_vector_test.cu => mixed_vector_test.cc} (92%) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index de06c860f5..a526c47aa9 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -22,7 +22,12 @@ endif() cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) -nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor) +if(WITH_GPU) + nv_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) +else() + cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) +endif() + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 
71bebeea63..f4fe334342 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/tensor.h" @@ -386,13 +387,14 @@ template class CPUVector : public std::vector> { public: CPUVector() : std::vector() {} - CPUVector(size_t count, const T &value = T()) + explicit CPUVector(size_t count, const T &value = T()) : std::vector(count, value) {} CPUVector(std::initializer_list init) : std::vector(init) {} - CPUVector(const std::vector &other) : std::vector(other) {} + explicit CPUVector(const std::vector &other) : std::vector(other) {} explicit CPUVector(const CPUVector &other) : std::vector(other) {} CPUVector(CPUVector &&other) : std::vector(std::move(other)) {} - CPUVector(std::vector &&other) : std::vector(std::move(other)) {} + explicit CPUVector(std::vector &&other) + : std::vector(std::move(other)) {} CPUVector &operator=(const CPUVector &other) { this->assign(other.begin(), other.end()); return *this; @@ -410,8 +412,6 @@ class CPUVector : public std::vector> { return os; } - void resize(size_t size) { this->resize(size); } - T &operator[](size_t id) { return this->at(id); } const T &operator[](size_t id) const { return this->at(id); } diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cc similarity index 92% rename from paddle/fluid/framework/mixed_vector_test.cu rename to paddle/fluid/framework/mixed_vector_test.cc index d57f825108..90d9a7c828 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cc @@ -11,8 +11,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + +#ifdef PADDLE_WITH_CUDA + #include +#endif + +#include + #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/mixed_vector.h" @@ -41,6 +48,38 @@ TEST(mixed_vector, CPU_VECTOR) { } } +TEST(mixed_vector, InitWithCount) { + paddle::framework::Vector vec(10, 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(vec[i], 10); + } +} + +TEST(mixed_vector, ForEach) { + vec tmp; + for (auto& v : tmp) { + VLOG(3) << v; + } +} + +TEST(mixed_vector, Reserve) { + paddle::framework::Vector vec; + vec.reserve(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} + +TEST(mixed_vector, Resize) { + paddle::framework::Vector vec; + vec.resize(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} + +#ifdef PADDLE_WITH_CUDA + static __global__ void multiply_10(int* ptr) { for (int i = 0; i < 10; ++i) { ptr[i] *= 10; @@ -92,23 +131,4 @@ TEST(mixed_vector, MultiGPU) { } } -TEST(mixed_vector, InitWithCount) { - paddle::framework::Vector vec(10, 10); - for (int i = 0; i < 10; ++i) { - ASSERT_EQ(vec[i], 10); - } -} - -TEST(mixed_vector, ForEach) { - vec tmp; - for (auto& v : tmp) { - } -} - -TEST(mixed_vector, Reserve) { - paddle::framework::Vector vec; - vec.reserve(1); - vec.push_back(0); - vec.push_back(0); - vec.push_back(0); -} +#endif From 0388d1cb2ac73b28896e28b510cf929da7fa7776 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 24 Jul 2018 16:36:33 +0800 Subject: [PATCH 13/37] some update --- .../.gitignore | 0 .../convert_data_to_recordio.py | 2 +- .../train.py | 41 ++++++++----------- 3 files changed, 18 insertions(+), 25 deletions(-) rename python/paddle/fluid/tests/demo/{text_classification => file_reader}/.gitignore (100%) rename python/paddle/fluid/tests/demo/{text_classification => file_reader}/convert_data_to_recordio.py (96%) rename python/paddle/fluid/tests/demo/{text_classification => file_reader}/train.py (78%) diff --git a/python/paddle/fluid/tests/demo/text_classification/.gitignore b/python/paddle/fluid/tests/demo/file_reader/.gitignore similarity index 100% rename from python/paddle/fluid/tests/demo/text_classification/.gitignore rename to python/paddle/fluid/tests/demo/file_reader/.gitignore diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py similarity index 96% rename from python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py rename to python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py index aaa713df88..b839e14889 100644 --- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py +++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py @@ -50,7 +50,7 @@ feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace()) BATCH_SIZE = 128 train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=10000), + paddle.dataset.imdb.train(word_dict), buf_size=25000), batch_size=BATCH_SIZE) test_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/file_reader/train.py similarity index 78% rename from python/paddle/fluid/tests/demo/text_classification/train.py rename to python/paddle/fluid/tests/demo/file_reader/train.py index 9e930b67a4..bc3a6dc81d 100644 --- a/python/paddle/fluid/tests/demo/text_classification/train.py +++ b/python/paddle/fluid/tests/demo/file_reader/train.py @@ -27,9 +27,6 @@ emb_dim = 128 # hidden 
dim hid_dim = 128 -# hidden dim2 -hid_dim2 = 96 - # class num class_dim = 2 @@ -42,13 +39,9 @@ def build_program(is_train): filenames=TRAIN_FILES if is_train else TEST_FILES, shapes=[[-1, 1], [-1, 1]], lod_levels=[1, 0], - dtypes=['int64', 'int64'], - thread_num=1) - if is_train: - file_obj = fluid.layers.io.shuffle(file_obj_handle, buffer_size=1000) - else: - file_obj = file_obj_handle - file_obj = fluid.layers.io.double_buffer(file_obj) + dtypes=['int64', 'int64']) + + file_obj = fluid.layers.io.double_buffer(file_obj_handle) with fluid.unique_name.guard(): @@ -56,22 +49,24 @@ def build_program(is_train): emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim]) - # sequence conv with window size = 3 - win_size = 3 conv_3 = fluid.nets.sequence_conv_pool( input=emb, num_filters=hid_dim, - filter_size=win_size, + filter_size=3, act="tanh", - pool_type="max") + pool_type="sqrt") - # fc layer after conv - fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") - # probability of each class - prediction = fluid.layers.fc(input=[fc_1], + prediction = fluid.layers.fc(input=[conv_3, conv_4], size=class_dim, act="softmax") + # cross entropy loss cost = fluid.layers.cross_entropy(input=prediction, label=label) @@ -117,11 +112,9 @@ def main(): try: batch_id = 0 while True: - result = map(numpy.array, - train_exe.run(fetch_list=fetch_var_list - if batch_id % 10 == 0 else [])) - if len(result) != 0: - print 'Train loss: ', result + loss, acc = map(numpy.array, + train_exe.run(fetch_list=fetch_var_list)) + print 'Train epoch', epoch_id, 'batch', batch_id, 'loss:', loss, 'acc:', acc batch_id += 1 except fluid.core.EOFException: print 'End of epoch', epoch_id @@ -138,7 +131,7 @@ def main(): acc.append(acc_np[0]) except: test_args['file'].reset() - print 'TEST: ', numpy.mean(loss), numpy.mean(acc) + print 'Test loss:', numpy.mean(loss), 'acc:', numpy.mean(acc) if __name__ == '__main__': From b163e601b6a4886523fb869b2055eb07be0799b9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 24 Jul 2018 17:38:40 +0800 Subject: [PATCH 14/37] add gtest --- paddle/fluid/operators/math/im2col_test.cc | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 8e3f0f2868..87bfc1e95a 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -160,8 +160,93 @@ void testIm2col() { delete context; } +void testIm2colCPU() { + paddle::framework::Tensor input; + paddle::framework::Tensor output; + int input_height = 3; + int input_width = 4; + int filter_size = 2; + int ic = 2; + std::vector stride({1, 1}); // stride_y, stride_x + std::vector padding({0, 0}); + std::vector dilation({1, 1}); // dilation_y, dilation_x + int output_height = + (input_height - filter_size + padding[0] * 2) / stride[0] + 1; + int output_width = + (input_width - filter_size + padding[1] * 2) / stride[1] + 1; + float* input_ptr = input.mutable_data({ic, input_height, input_width}, + paddle::platform::CPUPlace()); + for (int i = 0; i < input.numel(); ++i) { + input_ptr[i] = static_cast(i); + } + + paddle::platform::CPUPlace place; + paddle::platform::CPUDeviceContext context(place); + output.mutable_data( + {ic, filter_size, filter_size, output_height, output_width}, place); + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, + 
paddle::platform::CPUDeviceContext, float> + im2col; + im2col(context, input, dilation, stride, padding, &output); + auto ref_im2col = [&]( + const paddle::framework::Tensor& im, const std::vector& dilation, + const std::vector& stride, const std::vector& padding, + paddle::framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + int channels_col = im_channels * filter_height * filter_width; + + const float* im_data = im.data(); + float* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < output_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < output_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * output_height + h) * output_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? 0.f + : im_data[im_idx]; + } + } + } + }; + + paddle::framework::Tensor ref_output; + ref_output.mutable_data( + {ic, filter_size, filter_size, output_height, output_width}, place); + ref_im2col(input, dilation, stride, padding, &ref_output); + + float* out_cfo_ptr = output.data(); + for (int i = 0; i < ic * filter_size * filter_size; ++i) { + for (int j = 0; j < output_height * output_width; ++j) { + std::cout << out_cfo_ptr[i * output_height * output_width + j] << ","; + } + std::cout << std::endl; + } + + float* out_ref_ptr = ref_output.data(); + for (int i = 0; i < output.numel(); ++i) { + EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]); + } +} + TEST(math, im2col) { testIm2col(); + testIm2colCPU(); #ifdef PADDLE_WITH_CUDA testIm2col(); From 6788af4bf1a3d6fd6803b9506b09e77d79aeb18e Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 24 Jul 2018 17:50:35 +0800 Subject: [PATCH 15/37] refine test cases --- paddle/fluid/operators/math/im2col_test.cc | 45 ++++++++-------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 87bfc1e95a..db61f68db3 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -160,30 +160,26 @@ void testIm2col() { delete context; } -void testIm2colCPU() { +void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { paddle::framework::Tensor input; paddle::framework::Tensor output; - int input_height = 3; - int input_width = 4; - int filter_size = 2; - int ic = 2; - std::vector stride({1, 1}); // stride_y, stride_x - std::vector padding({0, 0}); + paddle::framework::Tensor ref_output; + std::vector padding({ph, pw}); + std::vector stride({1, 1}); // stride_y, stride_x std::vector dilation({1, 1}); // dilation_y, dilation_x - int output_height = - (input_height - filter_size + padding[0] * 2) / stride[0] + 1; - int output_width = - (input_width - filter_size + padding[1] * 2) / stride[1] + 1; - float* input_ptr = input.mutable_data({ic, input_height, input_width}, - paddle::platform::CPUPlace()); + int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; + int 
output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; + float* input_ptr = + input.mutable_data({ic, ih, iw}, paddle::platform::CPUPlace()); for (int i = 0; i < input.numel(); ++i) { - input_ptr[i] = static_cast(i); + input_ptr[i] = static_cast(i + 1); } paddle::platform::CPUPlace place; paddle::platform::CPUDeviceContext context(place); - output.mutable_data( - {ic, filter_size, filter_size, output_height, output_width}, place); + output.mutable_data({ic, fh, fw, output_height, output_width}, place); + ref_output.mutable_data({ic, fh, fw, output_height, output_width}, + place); paddle::operators::math::Im2ColFunctor< paddle::operators::math::ColFormat::kCFO, paddle::platform::CPUDeviceContext, float> @@ -200,7 +196,6 @@ void testIm2colCPU() { int filter_width = col->dims()[2]; int output_height = col->dims()[3]; int output_width = col->dims()[4]; - int channels_col = im_channels * filter_height * filter_width; const float* im_data = im.data(); @@ -215,7 +210,6 @@ void testIm2colCPU() { int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; int col_idx = (c * output_height + h) * output_width + w; int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || im_col_idx < 0 || im_col_idx >= im_width) ? 0.f @@ -225,19 +219,9 @@ void testIm2colCPU() { } }; - paddle::framework::Tensor ref_output; - ref_output.mutable_data( - {ic, filter_size, filter_size, output_height, output_width}, place); ref_im2col(input, dilation, stride, padding, &ref_output); float* out_cfo_ptr = output.data(); - for (int i = 0; i < ic * filter_size * filter_size; ++i) { - for (int j = 0; j < output_height * output_width; ++j) { - std::cout << out_cfo_ptr[i * output_height * output_width + j] << ","; - } - std::cout << std::endl; - } - float* out_ref_ptr = ref_output.data(); for (int i = 0; i < output.numel(); ++i) { EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]); @@ -246,7 +230,10 @@ void testIm2colCPU() { TEST(math, im2col) { testIm2col(); - testIm2colCPU(); + testIm2colCPU(/*ic*/ 3, /*ih*/ 5, /*iw*/ 5, /*fh*/ 3, /*fw*/ 2, /*ph*/ 0, + /*pw*/ 0); + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 1); #ifdef PADDLE_WITH_CUDA testIm2col(); From b72befc5cc06b75f4fdc44ac27f13d9114fad12f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 24 Jul 2018 23:22:53 +0800 Subject: [PATCH 16/37] reuse copy size --- paddle/fluid/operators/math/im2col.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 1a42ef1264..bb55ce21b0 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -54,6 +54,7 @@ class Im2ColFunctor Date: Wed, 25 Jul 2018 09:35:27 +0800 Subject: [PATCH 17/37] remove explicit for compile problem --- paddle/fluid/framework/mixed_vector.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index f4fe334342..7836ecb127 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -387,13 +387,13 @@ template class CPUVector : public std::vector> { public: CPUVector() : std::vector() {} - explicit CPUVector(size_t count, const T &value = T()) + CPUVector(size_t count, const T &value = T()) // NOLINT : std::vector(count, value) {} CPUVector(std::initializer_list init) : std::vector(init) {} - explicit CPUVector(const 
std::vector &other) : std::vector(other) {} - explicit CPUVector(const CPUVector &other) : std::vector(other) {} + CPUVector(const std::vector &other) : std::vector(other) {} // NOLINT + CPUVector(const CPUVector &other) : std::vector(other) {} CPUVector(CPUVector &&other) : std::vector(std::move(other)) {} - explicit CPUVector(std::vector &&other) + CPUVector(std::vector &&other) // NOLINT : std::vector(std::move(other)) {} CPUVector &operator=(const CPUVector &other) { this->assign(other.begin(), other.end()); From aa3618ed3e585802fea5e895430c8d6ff02beb45 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 25 Jul 2018 10:11:03 +0800 Subject: [PATCH 18/37] fix _create_prefetch_block in distribute_transpiler --- python/paddle/fluid/transpiler/distribute_transpiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index fc58703eca..e7698d8c52 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -887,7 +887,8 @@ class DistributeTranspiler(object): # create table optimize block in pserver program table_opt_op = [ op for op in self.optimize_ops - if op.input("Param")[0] == self.table_name + if 'Param' in op.input_names and op.input("Param")[0] == + self.table_name ][0] table_opt_block = pserver_program.create_block(pre_block_idx) # only support sgd now From 65e5aebd437f5352e1e937ca6de83f8baa318c8e Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 25 Jul 2018 12:59:40 +0800 Subject: [PATCH 19/37] fix mixed_vector_test --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/mixed_vector_test.cc | 62 ----------------- paddle/fluid/framework/mixed_vector_test.cu | 75 +++++++++++++++++++++ 3 files changed, 76 insertions(+), 63 deletions(-) create mode 100644 paddle/fluid/framework/mixed_vector_test.cu diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a526c47aa9..93ec047c80 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -23,7 +23,7 @@ endif() cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) if(WITH_GPU) - nv_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) + nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor) else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc index 90d9a7c828..0599c8d384 100644 --- a/paddle/fluid/framework/mixed_vector_test.cc +++ b/paddle/fluid/framework/mixed_vector_test.cc @@ -12,18 +12,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_CUDA - -#include - -#endif - #include #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/platform/gpu_info.h" template using vec = paddle::framework::Vector; @@ -77,58 +70,3 @@ TEST(mixed_vector, Resize) { vec.push_back(0); vec.push_back(0); } - -#ifdef PADDLE_WITH_CUDA - -static __global__ void multiply_10(int* ptr) { - for (int i = 0; i < 10; ++i) { - ptr[i] *= 10; - } -} - -cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { - return reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)) - ->stream(); -} - -TEST(mixed_vector, GPU_VECTOR) { - vec tmp; - for (int i = 0; i < 10; ++i) { - tmp.push_back(i); - } - ASSERT_EQ(tmp.size(), 10UL); - paddle::platform::CUDAPlace gpu(0); - - multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); - - for (int i = 0; i < 10; ++i) { - ASSERT_EQ(tmp[i], i * 10); - } -} - -TEST(mixed_vector, MultiGPU) { - if (paddle::platform::GetCUDADeviceCount() < 2) { - LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " - "GPUs in your machine."; - return; - } - - vec tmp; - for (int i = 0; i < 10; ++i) { - tmp.push_back(i); - } - ASSERT_EQ(tmp.size(), 10UL); - paddle::platform::CUDAPlace gpu0(0); - paddle::platform::SetDeviceId(0); - multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); - paddle::platform::CUDAPlace gpu1(1); - auto* gpu1_ptr = tmp.MutableData(gpu1); - paddle::platform::SetDeviceId(1); - multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); - for (int i = 0; i < 10; ++i) { - ASSERT_EQ(tmp[i], i * 100); - } -} - -#endif diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu new file mode 100644 index 0000000000..4b0caa8d35 --- /dev/null +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/platform/gpu_info.h" + +template +using vec = paddle::framework::Vector; + +static __global__ void multiply_10(int* ptr) { + for (int i = 0; i < 10; ++i) { + ptr[i] *= 10; + } +} + +cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { + return reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); +} + +TEST(mixed_vector, GPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10UL); + paddle::platform::CUDAPlace gpu(0); + + multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); + + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 10); + } +} + +TEST(mixed_vector, MultiGPU) { + if (paddle::platform::GetCUDADeviceCount() < 2) { + LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " + "GPUs in your machine."; + return; + } + + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10UL); + paddle::platform::CUDAPlace gpu0(0); + paddle::platform::SetDeviceId(0); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); + paddle::platform::CUDAPlace gpu1(1); + auto* gpu1_ptr = tmp.MutableData(gpu1); + paddle::platform::SetDeviceId(1); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 100); + } +} From a4c5223713ab1c2d1683071bd8218f2a37cc15e1 Mon Sep 17 00:00:00 2001 From: chengduo Date: Wed, 25 Jul 2018 13:44:13 +0800 Subject: [PATCH 20/37] Update test_pe_mnist threshold (#12348) * update test_pe_mnist threshold * clean code --- .../unittests/test_parallel_executor_mnist.py | 85 ++++++------------- 1 file changed, 28 insertions(+), 57 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index b21e16439a..76389d916f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -107,44 +107,24 @@ class TestMNIST(TestParallelExecutorBase): label = np.ones(shape=[32, 1], dtype='int64') return img, label - # simple_fc - def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + def _compare_reduce_and_allreduce(self, model, use_cuda, random_data=True): if use_cuda and not core.is_compiled_with_cuda(): return - self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) self.check_network_convergence( - simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) - - img, label = self._init_data() - + model, use_cuda=use_cuda, use_reduce=True) self.check_network_convergence( - simple_fc_net, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=use_reduce) + model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True) - def check_simple_fc_convergence_with_Reduce(self, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): - return - self.check_network_convergence( - simple_fc_net, use_cuda=use_cuda, use_reduce=True) - self.check_network_convergence( - simple_fc_net, - use_cuda=use_cuda, - allow_op_delay=True, - use_reduce=True) - - img, label = self._init_data() + img, label = self._init_data(random_data) all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( - simple_fc_net, + model, feed_dict={"image": img, "label": label}, 
use_cuda=use_cuda, use_reduce=False) reduce_first_loss, reduce_last_loss = self.check_network_convergence( - simple_fc_net, + model, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, @@ -153,7 +133,24 @@ class TestMNIST(TestParallelExecutorBase): for loss in zip(all_reduce_first_loss, reduce_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + self.assertAlmostEquals(loss[0], loss[1], delta=1e-4) + + # simple_fc + def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + if use_cuda and not core.is_compiled_with_cuda(): + return + self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) + self.check_network_convergence( + simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) + + img, label = self._init_data() + + self.check_network_convergence( + simple_fc_net, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + use_reduce=use_reduce) def test_simple_fc(self): # use_cuda @@ -162,8 +159,8 @@ class TestMNIST(TestParallelExecutorBase): def test_simple_fc_with_new_strategy(self): # use_cuda, use_reduce - self.check_simple_fc_convergence_with_Reduce(True) - self.check_simple_fc_convergence_with_Reduce(False) + self._compare_reduce_and_allreduce(simple_fc_net, True) + self._compare_reduce_and_allreduce(simple_fc_net, False) def check_simple_fc_parallel_accuracy(self, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): @@ -209,39 +206,13 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda) - def check_batchnorm_fc_convergence_use_reduce(self, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): - return - self.check_network_convergence( - fc_with_batchnorm, use_cuda=use_cuda, use_reduce=True) - - img, label = self._init_data() - - all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=False) - reduce_first_loss, reduce_last_loss = self.check_network_convergence( - fc_with_batchnorm, - feed_dict={"image": img, - "label": label}, - use_cuda=use_cuda, - use_reduce=True) - - for loss in zip(all_reduce_first_loss, reduce_first_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) - for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-4) - def test_batchnorm_fc(self): self.check_batchnorm_fc_convergence(True) self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence_use_reduce(True) - self.check_batchnorm_fc_convergence_use_reduce(False) + self._compare_reduce_and_allreduce(fc_with_batchnorm, True) + self._compare_reduce_and_allreduce(fc_with_batchnorm, False) if __name__ == '__main__': From 754e96a30c1812a658c5ccf1281a1c39fef9416b Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 25 Jul 2018 14:04:58 +0800 Subject: [PATCH 21/37] distribute lookup table work with parallel executor --- paddle/fluid/framework/details/multi_devices_graph_builder.cc | 2 +- python/paddle/fluid/transpiler/distribute_transpiler.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index f1f8674caf..e0fb92cc3c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ 
b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -207,7 +207,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( result.Set("ops", new GraphOps); // find send/recv vars so that we can place the distributed training - // realted op in the place 0 + // related op in the place 0 auto send_vars = FindDistTrainSendVars(nodes); auto recv_vars = FindDistTrainRecvVars(nodes); diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index e7698d8c52..4a9ea6af74 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -779,7 +779,9 @@ class DistributeTranspiler(object): outputs={"Out": prefetch_output_vars}, attrs={ "epmap": pserver_endpoints, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE + # FIXME(qiao) temporarily disable this config because prefetch + # is not act as other rpc op, it's more like a forward op + # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) # insert concat_op From 01566fb61b6605906339dad6eefa68a9a235299d Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 25 Jul 2018 06:25:03 +0000 Subject: [PATCH 22/37] 1. support mutil batch utest 2. support pool op --- .../inference/tensorrt/convert/pool2d_op.cc | 2 ++ .../tensorrt/convert/test_activation_op.cc | 5 +-- .../inference/tensorrt/convert/test_fc_op.cc | 8 ++--- .../inference/tensorrt/convert/test_mul_op.cc | 3 +- .../tensorrt/convert/test_pool2d_op.cc | 21 ++++++------ .../inference/tensorrt/convert/ut_helper.h | 26 +++++++++++---- .../fluid/inference/tensorrt/test_engine.cc | 33 ++++++++++++++++++- 7 files changed, 73 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index ea282a79d5..2db551758c 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -43,6 +43,8 @@ class Pool2dOpConverter : public OpConverter { const nvinfer1::DimsHW nv_strides(strides[0], strides[1]); const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL); + nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kMAX; if (pool_type == "max") { pool_t = nvinfer1::PoolingType::kMAX; diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 7dabfd9f6a..5a1543de81 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -23,7 +23,8 @@ namespace tensorrt { TEST(ReluOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - TRTConvertValidation validator(10, parameters, scope, 1000); + int runtime_batch = 3; + TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6)); validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6)); @@ -37,7 +38,7 @@ TEST(ReluOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(1); + validator.Execute(runtime_batch); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc index 081f4d6059..0c62108332 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -23,10 +23,10 @@ 
namespace tensorrt { TEST(fc_op, test) { std::unordered_set parameters({"mul-Y"}); framework::Scope scope; - TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1)); + int runtime_batch = 2; + TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); + validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1)); validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2)); - // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2)); // Prepare Op description @@ -38,7 +38,7 @@ TEST(fc_op, test) { validator.SetOp(*desc.Proto()); - validator.Execute(1); + validator.Execute(runtime_batch); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index 674f37f2fd..a54870566a 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -23,7 +23,8 @@ namespace tensorrt { TEST(MulOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - TRTConvertValidation validator(10, parameters, scope, 1000); + int runtime_batch = 0; + TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6)); validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10)); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index 261eb2c6ce..57c982f2c0 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -23,9 +23,14 @@ namespace tensorrt { TEST(Pool2dOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("pool2d-X", nvinfer1::Dims4(10, 3, 2, 2)); - validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims4(10, 3, 1, 1)); + int runtime_batch = 3; + TRTConvertValidation validator(5, parameters, scope, 1 << 15, runtime_batch); + + // We have already set the runtime batchsize, so the + // Dims should not contain the batch size. + // The ITensor's Dims of input and output should be C * H * W. 
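+  // With the 2x2 kernel, 2x2 strides and zero paddings set below, each
+  // 4x4 feature map is pooled down to 2x2, which gives the output dims.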
+ validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4)); + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); // Prepare Op description framework::OpDesc desc; @@ -34,7 +39,7 @@ TEST(Pool2dOpConverter, main) { desc.SetOutput("Out", {"pool2d-Out"}); std::vector ksize({2, 2}); - std::vector strides({1, 1}); + std::vector strides({2, 2}); std::vector paddings({0, 0}); std::string pooling_t = "max"; @@ -42,18 +47,12 @@ TEST(Pool2dOpConverter, main) { desc.SetAttr("ksize", ksize); desc.SetAttr("strides", strides); desc.SetAttr("paddings", paddings); - // std::string temp = ""; - // (*desc.Proto()).SerializeToString(&temp); - - // std::cout << temp << std::endl; - // std::ofstream f("__temp__", std::ios::out); - // f << temp; LOG(INFO) << "set OP"; validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(10); + validator.Execute(runtime_batch); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index f14885b238..e20169c4cf 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -63,13 +63,15 @@ class TRTConvertValidation { public: TRTConvertValidation() = delete; - TRTConvertValidation(int batch_size, + TRTConvertValidation(int max_batch_size, const std::unordered_set& parameters, framework::Scope& scope, // NOLINT - int workspace_size = 1 << 10) - : parameters_(parameters), scope_(scope) { + int workspace_size = 1 << 10, int runtime_batch_size = 1) + : parameters_(parameters), + scope_(scope), + runtime_batch_size_(runtime_batch_size) { // create engine. - engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_)); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); engine_->InitNetwork(); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); @@ -84,7 +86,7 @@ class TRTConvertValidation { // Declare a parameter varaible in the scope. void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) { - DeclVar(name, dims); + DeclVar(name, dims, true); } void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) { @@ -92,12 +94,20 @@ class TRTConvertValidation { } // Declare a variable in a fluid Scope. - void DeclVar(const std::string& name, const nvinfer1::Dims& dims) { + void DeclVar(const std::string& name, const nvinfer1::Dims& dims, + bool is_param = false) { platform::CPUPlace place; platform::CPUDeviceContext ctx(place); // Init Fluid tensor. std::vector dim_vec(dims.d, dims.d + dims.nbDims); + // There is no batchsize in ITensor's shape, but We should add it to + // tensor's + // shape of fluid. If the variable is not parameter and the batch size + // greater than 0, + // add the batchsize to dim_vec. + if (is_param != true && runtime_batch_size_ > 0) + dim_vec.insert(dim_vec.begin(), runtime_batch_size_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); @@ -167,6 +177,10 @@ class TRTConvertValidation { std::unique_ptr op_desc_; const std::unordered_set& parameters_; framework::Scope& scope_; + // It represents the runtime batchsize when we test. + // If the value greater than 0, we add this to + // the first dimension of tensor's shape of fluid. 
+ int runtime_batch_size_; }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index f8732e51b6..dc03702990 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -113,7 +113,7 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ASSERT_EQ(y_cpu[1], 14.5); } -TEST_F(TensorRTEngineTest, test_conv2d_temp) { +TEST_F(TensorRTEngineTest, test_conv2d) { // Weight in CPU memory. float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; float raw_bias[1] = {0}; @@ -146,6 +146,37 @@ TEST_F(TensorRTEngineTest, test_conv2d_temp) { ASSERT_EQ(y_cpu[1], 6.0); } +TEST_F(TensorRTEngineTest, test_pool2d) { + // Weight in CPU memory. + auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3{1, 2, 2}); + + nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; + auto* pool_layer = + TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(x), + pool_t, nvinfer1::DimsHW{2, 2}); + + PADDLE_ENFORCE(pool_layer != nullptr); + pool_layer->setStride(nvinfer1::DimsHW{1, 1}); + pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); + + engine_->DeclareOutput(pool_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; + engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), + 8 * sizeof(float)); + engine_->Execute(2); + + LOG(INFO) << "to get output"; + float* y_cpu = new float[2]; + engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + + ASSERT_EQ(y_cpu[0], 2.0); + ASSERT_EQ(y_cpu[1], 5.0); +} + } // namespace tensorrt } // namespace inference } // namespace paddle From fd2d2c66e98e862d3b35e768149ebc18bc64c8a5 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 25 Jul 2018 17:17:34 +0800 Subject: [PATCH 23/37] add flag to prevent unnessary memory free --- paddle/fluid/memory/detail/buddy_allocator.cc | 17 ++++-- python/paddle/fluid/__init__.py | 56 +++++++++---------- 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 01a8501dd4..c2f45fdc99 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -15,6 +15,10 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "glog/logging.h" +DEFINE_bool(free_idle_memory, false, + "If it is true, Paddle will try to free idle memory trunks during " + "running time."); + namespace paddle { namespace memory { namespace detail { @@ -152,13 +156,14 @@ void BuddyAllocator::Free(void* p) { pool_.insert( IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); - // Clean up if existing too much free memory - - // Prefer freeing fallback allocation first - CleanIdleFallBackAlloc(); + if (FLAGS_free_idle_memory) { + // Clean up if existing too much free memory + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); - // Free normal allocation - CleanIdleNormalAlloc(); + // Free normal allocation + CleanIdleNormalAlloc(); + } } size_t BuddyAllocator::Used() { return total_used_; } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 9903047f74..c2d641600c 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -62,33 +62,33 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ - trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ - parallel_executor.__all__ + lod_tensor.__all__ + [ - 'io', - 'initializer', - 'layers', - 'contrib', - 'transpiler', - 'nets', - 'optimizer', - 'learning_rate_decay', - 'backward', - 'regularizer', - 'LoDTensor', - 'LoDTensorArray', - 'CPUPlace', - 'CUDAPlace', - 'CUDAPinnedPlace', - 'Tensor', - 'ParamAttr', - 'WeightNormParamAttr', - 'DataFeeder', - 'clip', - 'profiler', - 'unique_name', - 'recordio_writer', - 'Scope', - ] + trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ + parallel_executor.__all__ + lod_tensor.__all__ + [ + 'io', + 'initializer', + 'layers', + 'contrib', + 'transpiler', + 'nets', + 'optimizer', + 'learning_rate_decay', + 'backward', + 'regularizer', + 'LoDTensor', + 'LoDTensorArray', + 'CPUPlace', + 'CUDAPlace', + 'CUDAPinnedPlace', + 'Tensor', + 'ParamAttr', + 'WeightNormParamAttr', + 'DataFeeder', + 'clip', + 'profiler', + 'unique_name', + 'recordio_writer', + 'Scope', + ] def __bootstrap__(): @@ -123,7 +123,7 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem' + 'init_allocated_mem', 'free_idle_memory' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') From 5533400720d5f1d407320667ea09b58395d44607 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 25 Jul 2018 11:48:19 +0000 Subject: [PATCH 24/37] fix comments --- .../inference/tensorrt/convert/pool2d_op.cc | 9 ++--- .../tensorrt/convert/test_activation_op.cc | 5 ++- .../inference/tensorrt/convert/test_fc_op.cc | 5 ++- .../inference/tensorrt/convert/test_mul_op.cc | 5 ++- .../tensorrt/convert/test_pool2d_op.cc | 10 +++--- .../inference/tensorrt/convert/ut_helper.h | 33 +++++++++++-------- 6 files changed, 35 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 2db551758c..c5852bdb53 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -30,6 +30,7 @@ class Pool2dOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input1 = 
engine_->GetITensor(op_desc.Input("X")[0]); + std::string pool_type = boost::get(op_desc.GetAttr("pooling_type")); std::vector ksize = @@ -45,18 +46,18 @@ class Pool2dOpConverter : public OpConverter { PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL); - nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kMAX; + nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX; if (pool_type == "max") { - pool_t = nvinfer1::PoolingType::kMAX; + nv_pool_type = nvinfer1::PoolingType::kMAX; } else if (pool_type == "avg") { - pool_t = nvinfer1::PoolingType::kAVERAGE; + nv_pool_type = nvinfer1::PoolingType::kAVERAGE; } else { PADDLE_THROW("TensorRT unsupported pooling type!"); } auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(input1), - pool_t, nv_ksize); + nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created."); layer->setStride(nv_strides); layer->setPadding(nv_paddings); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 5a1543de81..e82762ea03 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -23,8 +23,7 @@ namespace tensorrt { TEST(ReluOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - int runtime_batch = 3; - TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); + TRTConvertValidation validator(10, parameters, scope, 1000); validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6)); validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6)); @@ -38,7 +37,7 @@ TEST(ReluOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(runtime_batch); + validator.Execute(5); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc index 0c62108332..1ae2668e73 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -23,8 +23,7 @@ namespace tensorrt { TEST(fc_op, test) { std::unordered_set parameters({"mul-Y"}); framework::Scope scope; - int runtime_batch = 2; - TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); + TRTConvertValidation validator(10, parameters, scope, 1000); validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1)); validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2)); @@ -38,7 +37,7 @@ TEST(fc_op, test) { validator.SetOp(*desc.Proto()); - validator.Execute(runtime_batch); + validator.Execute(10); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc index a54870566a..3d34cd7d5d 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -23,8 +23,7 @@ namespace tensorrt { TEST(MulOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - int runtime_batch = 0; - TRTConvertValidation validator(10, parameters, scope, 1000, runtime_batch); + TRTConvertValidation validator(10, parameters, scope, 1000, false); validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6)); validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10)); @@ -40,7 +39,7 @@ TEST(MulOpConverter, main) 
{ validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(1); + validator.Execute(2); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index 57c982f2c0..c5dddbc8cd 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -23,12 +23,10 @@ namespace tensorrt { TEST(Pool2dOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - int runtime_batch = 3; - TRTConvertValidation validator(5, parameters, scope, 1 << 15, runtime_batch); + TRTConvertValidation validator(5, parameters, scope, 1 << 15); - // We have already set the runtime batchsize, so the - // Dims should not contain the batch size. - // The ITensor's Dims of input and output should be C * H * W. + // The ITensor's Dims should not contain the batch size, + // so the ITensor's Dims of input and output should be C * H * W. validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4)); validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); @@ -52,7 +50,7 @@ TEST(Pool2dOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(runtime_batch); + validator.Execute(3); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index e20169c4cf..a4ab111eb4 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -66,10 +66,11 @@ class TRTConvertValidation { TRTConvertValidation(int max_batch_size, const std::unordered_set& parameters, framework::Scope& scope, // NOLINT - int workspace_size = 1 << 10, int runtime_batch_size = 1) + int workspace_size = 1 << 10, bool if_add_batch = true) : parameters_(parameters), scope_(scope), - runtime_batch_size_(runtime_batch_size) { + if_add_batch_(if_add_batch), + max_batch_size_(max_batch_size) { // create engine. engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); engine_->InitNetwork(); @@ -102,12 +103,10 @@ class TRTConvertValidation { // Init Fluid tensor. std::vector dim_vec(dims.d, dims.d + dims.nbDims); // There is no batchsize in ITensor's shape, but We should add it to - // tensor's - // shape of fluid. If the variable is not parameter and the batch size - // greater than 0, - // add the batchsize to dim_vec. + // tensor's shape of fluid. If the variable is not a parameter and the + // if_add_batch_ flag is true, add the max batch size to dim_vec.
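+ // (For example, with max_batch_size = 5 and an input ITensor of + // Dims3(3, 4, 4), as in the pool2d test above, the fluid tensor is + // resized to (5, 3, 4, 4).)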
+ if (is_param != true && if_add_batch_ == true) + dim_vec.insert(dim_vec.begin(), max_batch_size_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); @@ -141,6 +140,7 @@ class TRTConvertValidation { void Execute(int batch_size) { // Execute Fluid Op + PADDLE_ENFORCE_LE(batch_size, max_batch_size_); platform::CPUPlace place; platform::CPUDeviceContext ctx(place); op_->Run(scope_, place); @@ -159,9 +159,14 @@ class TRTConvertValidation { auto* var = scope_.FindVar(output); auto tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &fluid_out); + + size_t fluid_out_size = fluid_out.size(); + if (if_add_batch_ == true) { + fluid_out_size = batch_size * (tensor->dims().size() / max_batch_size_); + } // Compare two output ASSERT_FALSE(fluid_out.empty()); - for (size_t i = 0; i < fluid_out.size(); i++) { + for (size_t i = 0; i < fluid_out_size; i++) { // Loose the threshold for CI in different machine model. EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); } @@ -177,10 +182,12 @@ class TRTConvertValidation { std::unique_ptr op_desc_; const std::unordered_set& parameters_; framework::Scope& scope_; - // It represents the runtime batchsize when we test. - // If the value greater than 0, we add this to - // the first dimension of tensor's shape of fluid. - int runtime_batch_size_; + // The ITensor of trt does not contain the batch size, + // but, in most cases, we need to set the batch size for + // fluid's tensor shape. This variable indicates + // whether to add the batch size to fluid's tensor shape. + bool if_add_batch_; + int max_batch_size_; }; } // namespace tensorrt From c8adfb3451281aad51e1398023d5af80b31fa7f3 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 25 Jul 2018 12:32:47 +0000 Subject: [PATCH 25/37] add paddle_enforce --- paddle/fluid/inference/tensorrt/convert/pool2d_op.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index c5852bdb53..11cad95361 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -29,6 +29,8 @@ class Pool2dOpConverter : public OpConverter { << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); std::string pool_type = From 83e59257d0a156b5bbe992d44d9b5d9ff5207d3d Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 26 Jul 2018 12:16:14 +0800 Subject: [PATCH 26/37] fix manylinux1 Failed to publish artifacts --- paddle/fluid/inference/api/demo_ci/clean.sh | 4 ++++ paddle/scripts/paddle_build.sh | 1 + 2 files changed, 5 insertions(+) create mode 100755 paddle/fluid/inference/api/demo_ci/clean.sh diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh new file mode 100755 index 0000000000..0d9f3d2aa2 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -0,0 +1,4 @@ +set -x +cd `dirname $0` +rm -rf build/ data/ +set +x diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9e58a39eb0..0f334b2892 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -547,6 +547,7 @@ function test_fluid_inference_lib() { EOF cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
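# run.sh builds and runs the demo; the clean.sh added below then removes the demo's build/ and data/ outputs so stale artifacts cannot break publishing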
./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} + ./clean.sh fi } From 4f71a3b12bfb5401f1f4e49fe8e71634a5d1c645 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 26 Jul 2018 04:33:08 +0000 Subject: [PATCH 27/37] fix a bug --- paddle/fluid/inference/tensorrt/convert/ut_helper.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index a4ab111eb4..39529cc2c7 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -162,7 +162,8 @@ class TRTConvertValidation { size_t fluid_out_size = fluid_out.size(); if (if_add_batch_ == true) { - fluid_out_size = batch_size * (tensor->dims().size() / max_batch_size_); + fluid_out_size = + batch_size * (framework::product(tensor->dims()) / max_batch_size_); } // Compare two output ASSERT_FALSE(fluid_out.empty()); From 73fcfc06ec76bae730906583b44c649a7a008bc0 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Thu, 26 Jul 2018 13:46:08 +0800 Subject: [PATCH 28/37] refine conv cudnn enforce (#12353) * refine conv cudnn enforce * update * update all cudnn ops * fix --- paddle/fluid/operators/conv_cudnn_op.cu.cc | 26 +++++++++---------- .../operators/conv_transpose_cudnn_op.cu.cc | 18 ++++++------- paddle/fluid/operators/math/softmax.cu | 4 +-- paddle/fluid/operators/pool_cudnn_op.cu.cc | 4 +-- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 1828be57b5..b3781ded01 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -77,7 +77,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { // cudnn 7 can support groups, no need to do it mannually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. 
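// Note: CUDNN_ENFORCE checks the cudnnStatus_t returned by the wrapped call // and throws on failure; PATCH 30 below refines the macro so the thrown // message carries the cuDNN error string instead of a generic "cuDNN call failed".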
- PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( cudnn_conv_desc, groups)); groups = 1; #endif @@ -129,7 +129,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &algo)); @@ -140,18 +140,18 @@ class CUDNNConvOpKernel : public framework::OpKernel { if (dev_ctx.GetComputeCapability() >= 70 && std::type_index(typeid(T)) == std::type_index(typeid(platform::float16))) { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); // Currently tensor core is only enabled using this algo algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } else { - PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); } #endif // get workspace size able to allocate - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); // It is possible for float16 on Volta GPU to allocate more memory than @@ -165,7 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, cudnn_filter_desc, filter_data + i * group_offset_filter, cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, @@ -218,7 +218,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // cudnn 7 can support groups, no need to do it mannually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. 
- PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( cudnn_conv_desc, groups)); groups = 1; #endif @@ -273,7 +273,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto handle = dev_ctx.cudnn_handle(); if (input_grad) { if (FLAGS_cudnn_deterministic) { - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, // dyDesc: Handle to the previously initialized input @@ -289,7 +289,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); @@ -298,7 +298,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { if (filter_grad) { if (FLAGS_cudnn_deterministic) { - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_filter_desc, @@ -308,7 +308,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, cudnn_filter_desc, filter_algo, &tmp_size)); @@ -326,7 +326,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, cudnn_filter_desc, filter_data + i * group_offset_filter, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, @@ -339,7 +339,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. for (int i = 0; i < groups; i++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, filter_algo, cudnn_workspace, diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 038ea89990..82fff68e75 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -87,7 +87,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); // Get the algorithm - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, // dxDesc: Handle to the previously initialized output tensor // descriptor. 
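// (Conv transpose forward is computed via cudnnConvolutionBackwardData, which // is why this forward kernel queries the "backward data" algorithm and workspace.)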
@@ -95,7 +95,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); // get workspace size able to allocate - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); @@ -110,7 +110,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, &beta, @@ -178,11 +178,11 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { auto handle = dev_ctx.cudnn_handle(); if (input_grad) { // choose backward algorithm for data - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &data_algo)); - PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_input_desc, data_algo, &fwd_ws_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); @@ -190,7 +190,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { if (filter_grad) { // choose backward algorithm for filter - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_filter_desc, @@ -198,7 +198,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { workspace_size_limit, &filter_algo)); // get workspace for backwards filter algorithm - PADDLE_ENFORCE( + CUDNN_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_filter_desc, filter_algo, &bwd_filter_ws_size)); @@ -222,7 +222,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_output_desc, output_grad_data + output_grad_offset * g, cudnn_filter_desc, filter_data + filter_offset * g, cudnn_conv_desc, data_algo, @@ -237,7 +237,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. 
// Gradient with respect to the filter for (int g = 0; g < groups; g++) { - PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_output_desc, output_grad_data + output_grad_offset * g, cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, filter_algo, diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index a579182ec1..3effe77625 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -52,7 +52,7 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward( + CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, @@ -83,7 +83,7 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward( + CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index be55bc43b1..31f083565f 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -81,7 +81,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { // ------------------- cudnn pool algorithm --------------------- auto handle = ctx.cuda_device_context().cudnn_handle(); ScalingParamType alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward( + CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, cudnn_output_desc, output_data)); } @@ -154,7 +154,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { T *input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. - PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward( + CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data)); From 604bd85a458ea710342c705d45228dcd822653bf Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 26 Jul 2018 15:48:39 +0800 Subject: [PATCH 29/37] update inference_optimize() --- paddle/fluid/framework/block_desc.h | 5 ++--- python/paddle/fluid/framework.py | 23 ++++++++++++++++++++++- python/paddle/fluid/layers/io.py | 3 --- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index ce48548418..960ca39e1e 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -88,9 +88,8 @@ class BlockDesc { OpDesc *InsertOp(size_t index); /* - * Remove Op and its input/output variables. - * Note that for either input or output variable, if it is also an input or - * output variable of other ops, we should remain it. 
+ * Only remove the op itself; + * do nothing to its input and output variables. */ void RemoveOp(size_t s, size_t e); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index db550eccf9..e10f8325e4 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1540,7 +1540,12 @@ class Program(object): def inference_optimize(self): """ - This method will create a new program and change the :code:`is_test` + This method will create a new program and make the following adjustments to it: + 1. Remove all reader variables and their creator ops if they exist. + + 2. Remove the :code:`read_op` if it exists. + + 3. Change the :code:`is_test` attribute of operators to :code:`True`. All the :code:`Parameter` information will be lost. @@ -1554,6 +1559,22 @@ class Program(object): # core.inference_optimize being fixed. res = Program() res.desc = core.ProgramDesc(self.desc) + + # remove all readers and the read_op if they exist + read_op_idx = 0 + root_block = res.desc.block(0) + while True: + if read_op_idx >= root_block.op_size() or root_block.op( + read_op_idx).type() == 'read': + break + read_op_idx += 1 + if read_op_idx < root_block.op_size(): + root_block._remove_op(0, read_op_idx + 1) + for var in root_block.all_vars(): + if var.type() == core.VarDesc.VarType.READER: + root_block._remove_var(var.name()) + + # change all `is_test` attributes to True for i in xrange(res.desc.num_blocks()): block = res.desc.block(i) for j in xrange(block.op_size()): diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index df6becabd1..fab4a92a0a 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -443,9 +443,6 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): main_prog_var = _copy_reader_var_(default_main_program().current_block(), startup_var) - if for_parallel: - main_prog_var = parallel(reader=main_prog_var) - return monkey_patch_reader_methods(main_prog_var) From 54e9fd3f61fcba58e77a253ceab9af77754f176d Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 26 Jul 2018 18:25:55 +0800 Subject: [PATCH 30/37] fix cudnn enforce --- paddle/fluid/platform/cudnn_helper.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 6ea4f8b7cb..bb8b14bb9f 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -59,13 +59,12 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { #define CUDNN_VERSION_MIN(major, minor, patch) \ (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) -#define CUDNN_ENFORCE(condition) \ - do { \ - cudnnStatus_t status = condition; \ - if (status != CUDNN_STATUS_SUCCESS) { \ - VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \ - PADDLE_THROW("cuDNN call failed"); \ - } \ +#define CUDNN_ENFORCE(condition) \ + do { \ + cudnnStatus_t status = condition; \ + if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \ + PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \ + } \ } while (false) enum class DataLayout { // Not use From 10a1c2bb86d602f27490079adda1ab594fdeeed9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 27 Jul 2018 10:16:14 +0800 Subject: [PATCH 31/37] control omp num_threads --- paddle/fluid/platform/cpu_helper.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 77ecb17011..234a04b5c2 100644 ---
a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_MKLML +#include #include "paddle/fluid/platform/dynload/mklml.h" #endif @@ -33,6 +34,7 @@ void SetNumThreads(int num_threads) { #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); + omp_set_num_threads(num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif From 9788e5ab8795c9f5cf19f4fa7f7dd37a20512d12 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 27 Jul 2018 10:36:06 +0800 Subject: [PATCH 32/37] add flags to control num_threads --- .../fluid/inference/tests/book/test_inference_nlp.cc | 11 ++--------- paddle/fluid/platform/cpu_helper.cc | 3 +++ paddle/fluid/platform/init.cc | 4 +++- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 5cc1db12bb..e2a3e9d46e 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -20,9 +20,6 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/platform/cpu_helper.h" -#ifdef PADDLE_WITH_MKLML -#include -#endif DEFINE_string(model_path, "", "Directory of the inference model."); DEFINE_string(data_file, "", "File of input index data."); @@ -30,6 +27,7 @@ DEFINE_int32(repeat, 100, "Running the inference program repeat times"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_int32(num_threads, 1, "Number of threads should be used"); DECLARE_bool(use_mkldnn); +DECLARE_int32(paddle_num_threads); inline double GetCurrentMs() { struct timeval time; @@ -160,12 +158,7 @@ TEST(inference, nlp) { std::unique_ptr scope( new paddle::framework::Scope()); -#ifdef PADDLE_WITH_MKLML - // only use 1 thread number per std::thread - omp_set_dynamic(0); - omp_set_num_threads(1); - paddle::platform::SetNumThreads(1); -#endif + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); double start_ms = 0, stop_ms = 0; if (FLAGS_num_threads > 1) { diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 234a04b5c2..6841652b75 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -24,6 +24,9 @@ limitations under the License. */ #include #endif +DEFINE_int32(paddle_num_threads, 1, + "Number of threads for each paddle instance."); + namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 0b77652841..79a2ca96ed 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -23,6 +23,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" +DECLARE_int32(paddle_num_threads); + namespace paddle { namespace framework { @@ -115,7 +117,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); #ifndef PADDLE_WITH_MKLDNN - platform::SetNumThreads(1); + platform::SetNumThreads(FLAGS_paddle_num_threads); #endif } From 23e3f790c0601136be1582cad7ad6da8b8e5ddb1 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 27 Jul 2018 11:00:41 +0800 Subject: [PATCH 33/37] enable read from python --- python/paddle/fluid/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index c2d641600c..d1d6dd75ee 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -123,7 +123,7 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem', 'free_idle_memory' + 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') From 4f0383f52ea7e7da06a7b383d736aeccb7d0e512 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 27 Jul 2018 13:29:38 +0800 Subject: [PATCH 34/37] fix unknown flag --- paddle/fluid/platform/cpu_helper.cc | 3 --- paddle/fluid/platform/init.cc | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 6841652b75..234a04b5c2 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -24,9 +24,6 @@ limitations under the License. */ #include #endif -DEFINE_int32(paddle_num_threads, 1, - "Number of threads for each paddle instance."); - namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 79a2ca96ed..6f1f0c4796 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -23,7 +23,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" -DECLARE_int32(paddle_num_threads); +DEFINE_int32(paddle_num_threads, 1, + "Number of threads for each paddle instance."); namespace paddle { namespace framework { From 57c8aa60c2e1b2f0a2e99b17c53c9b8ce56f2521 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 27 Jul 2018 13:50:15 +0800 Subject: [PATCH 35/37] Add self to author --- AUTHORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.md b/AUTHORS.md index 8c4a113fc2..41b7193677 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -46,6 +46,7 @@ | tianbingsz | Tian-Bing Xu | | tpatejko | Tomasz Patejko | | typhoonzero | Yi Wu | +| velconia | Qi-Yang Min | | wanghaoshuang | Hao-Shuang Wang | | wangyang59 | Yang Wang | | wangzhen-nlp | Zhen Wang | From 2409d0f710aab9e030ff23b2b0b66d874294b931 Mon Sep 17 00:00:00 2001 From: chengduo Date: Fri, 27 Jul 2018 16:29:58 +0800 Subject: [PATCH 36/37] Refine regularization for selected_rows (#12369) * refine regularization for selected_rows * clean lookup_table * refine rpc_server_test * temporally disable rpc_server_test * fix rpc_server_test * add unit test --- paddle/fluid/operators/CMakeLists.txt | 1 + .../operators/distributed/CMakeLists.txt | 6 +- .../operators/distributed/rpc_server_test.cc | 25 ++--- paddle/fluid/operators/extract_rows_op.cc | 103 ++++++++++++++++++ paddle/fluid/operators/lookup_table_op.cc | 46 +++----- paddle/fluid/operators/lookup_table_op.cu | 73 +++++-------- paddle/fluid/operators/lookup_table_op.h | 40 +------ python/paddle/fluid/regularizer.py | 16 ++- .../tests/unittests/test_extract_rows_op.py | 58 ++++++++++ .../tests/unittests/test_lookup_table_op.py | 47 -------- 10 files changed, 240 insertions(+), 175 deletions(-) create mode 100644 paddle/fluid/operators/extract_rows_op.cc create mode 100644 python/paddle/fluid/tests/unittests/test_extract_rows_op.py diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9b56ad4c55..2da52dbf48 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -270,6 +270,7 @@ op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) op_library(unsqueeze_op DEPS reshape_op) op_library(squeeze_op DEPS reshape_op) +op_library(extract_rows_op DEPS memory) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 1612927055..da5d20505e 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -17,9 +17,9 @@ if(WITH_GRPC) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) - cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL) + DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + cc_test(rpc_server_test SRCS rpc_server_test.cc + DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc 
lookup_sparse_table_op SERIAL) return() endif() diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index 9f2360ec70..b50830c362 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -30,7 +30,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace distributed = paddle::operators::distributed; -USE_OP(lookup_table); +USE_NO_KERNEL_OP(lookup_sparse_table); std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; @@ -42,13 +42,13 @@ framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); framework::VariableNameMap output({{"Output", {"out"}}}); auto op = block->AppendOp(); - op->SetType("lookup_table"); + op->SetType("lookup_sparse_table"); op->SetInput("W", {"w"}); op->SetInput("Ids", {"ids"}); op->SetOutput("Out", {"out"}); auto& out = *root_block->Var("out"); - out.SetType(framework::proto::VarType::SELECTED_ROWS); + out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetShape({10, 10}); return block; @@ -59,20 +59,19 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { w_var->GetMutable(); auto out_var = scope->Var("out"); - out_var->GetMutable(); + out_var->GetMutable(); auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); + ids_var->GetMutable(); } void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - auto rows = ids_var->mutable_rows(); - for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2); - ids_var->mutable_value()->Resize({rows_numel, 1}); - ids_var->mutable_value()->mutable_data(*place); + auto ids_var = scope->Var("ids")->GetMutable(); + int64_t* ids_ptr = + ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); + for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; } void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, @@ -148,11 +147,11 @@ TEST(PREFETCH, CPU) { client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name); client->Wait(); auto var = scope.Var(out_var_name); - auto value = var->GetMutable()->value(); - auto ptr = value.mutable_data(place); + auto value = var->GetMutable(); + auto ptr = value->mutable_data(place); for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast(i * 2)); + EXPECT_EQ(ptr[0 + i * value->dims()[1]], static_cast(i * 2)); } } diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc new file mode 100644 index 0000000000..9a297d03cf --- /dev/null +++ b/paddle/fluid/operators/extract_rows_op.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ExtractRowsOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ExtractRowsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ExtractRowsOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X")[0], + framework::proto::VarType::SELECTED_ROWS, + "The type of input(X) must be SelectedRows."); + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector{in_dims[0], 1})); + } +}; + +class ExtractRowsOp : public framework::OperatorBase { + public: + ExtractRowsOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &in = scope.FindVar(Input("X"))->Get(); + auto out = scope.FindVar(Output("Out"))->GetMutable(); + + auto in_rows = in.rows(); + auto out_dim = framework::make_ddim( + std::vector{static_cast(in_rows.size()), 1}); + auto dst_ptr = out->mutable_data(out_dim, in.place()); + + if (paddle::platform::is_gpu_place(in.place())) { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = pool.Get(in.place()); + auto src_ptr = in_rows.Data(in.place()); + auto stream = + reinterpret_cast(*dev_ctx) + .stream(); + memory::Copy(boost::get(out->place()), dst_ptr, + boost::get(in.place()), src_ptr, + in_rows.size() * sizeof(int64_t), stream); +#else + PADDLE_THROW("Not compiled with CUDA."); +#endif + } else { + memory::Copy(platform::CPUPlace(), dst_ptr, platform::CPUPlace(), + in_rows.data(), in_rows.size() * sizeof(int64_t)); + } + } +}; + +class ExtractRowsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(SelectedRows). The input of the extract_rows operator," + " whose type is SelectedRows."); + AddOutput("Out", "(Tensor). The rows of input(X)."); + + AddComment(R"DOC( + ExtractRows Operator. + +The extract_rows_op extracts the rows from the input(X), +whose type is SelectedRows.
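+ +For example, given an input(X) whose rows are [0, 4, 4, 7], the output is +a (4 x 1) Tensor holding the values [0, 4, 4, 7].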
+ + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(extract_rows, ops::ExtractRowsOp, ops::ExtractRowsOpMaker, + ops::ExtractRowsOpInferShape); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index bda4994322..3e8f3ec5c5 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -33,19 +33,15 @@ class LookupTableOp : public framework::OperatorWithKernel { auto table_dims = ctx->GetInputDim("W"); auto ids_dims = ctx->GetInputDim("Ids"); - auto ids_var_type = ctx->GetInputsVarType("Ids").front(); - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W - // and it must be a column vector with rank = 2 while the 2nd dimension - // size must be 1, when Ids's type is SelectedRows, the rows of Ids - // contains the ids to be looked up in W; - if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); - } + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); - ctx->ShareLoD("Ids", /*->*/ "Out"); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } } protected: @@ -62,17 +58,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("W", "(Tensor) The input represents embedding tensors, " "which is a learnable parameter."); - AddInput( - "Ids", - "(Tensor or SelectedRows) Ids's type can be Tensor or " - "SelectedRows, when Ids's type is Tensor, this tensor contains " - "the ids to be looked up in W and it must be a column vector with " - "rank = 2 while the 2nd dimension size must be 1; when Ids's type is " - "SelectedRows, the rows of Ids contains the ids to be looked up " - "in W."); - AddOutput("Out", - "(Tensor or SelectedRows) The lookup results, which have the " - "same type as W."); + AddInput("Ids", + "An input with type int32 or int64, " + "containing the ids to be looked up in W. " + "Ids must be a column vector with rank = 2. " + "The 2nd dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); AddAttr("is_sparse", "(boolean, default false) " "Sparse update.") @@ -90,15 +81,10 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { Lookup Table Operator. This operator is used to perform lookups on the parameter W, -then concatenated into a dense or sparse tensor. - -The type of Ids(Input) is SelectedRows, Tensor or LoDTensor, when Ids's -type is SelectedRows, the rows of Ids contains the ids to be looked up in W; -when Ids's type is Tensor, this tensor contains the ids to be looked up in W -and it must be a column vector with rank = 2 while the 2nd dimension size must be 1, -at this time, Ids can carry the LoD (Level of Details) information, or not, and -the output only shares the LoD information with input Ids. +which are then concatenated into a dense tensor. +The input Ids can carry the LoD (Level of Details) information +or not, and the output only shares the LoD information with the input Ids.
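+ +For example, if W has shape (N, D) and Ids has shape (K, 1), then Out has +shape (K, D), where row i of Out is the Ids[i]-th row of W.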
)DOC"); } diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 77722c50d3..27483372b9 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -23,7 +23,7 @@ namespace operators { template -__global__ void LookupTable(T* output, const T* table, const int64_t* ids, +__global__ void LookupTable(T *output, const T *table, const int64_t *ids, const int64_t N, const int64_t K, const int64_t D, const int64_t padding_idx) { int idx = threadIdx.x; @@ -33,8 +33,8 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, int64_t id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); - T* out = output + idy * D; - const T* tab = table + id * D; + T *out = output + idy * D; + const T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { if (PaddingFlag) { if (id == padding_idx) @@ -50,7 +50,7 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, } template -__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, +__global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids, const int64_t N, const int64_t K, const int64_t D) { int idx = threadIdx.x; @@ -60,8 +60,8 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, int id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); - const T* out = output + idy * D; - T* tab = table + id * D; + const T *out = output + idy * D; + T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { paddle::platform::CudaAtomicAdd(&tab[i], out[i]); } @@ -72,36 +72,19 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, template class LookupTableCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* table_t = context.Input("W"); + void Compute(const framework::ExecutionContext &context) const override { + auto *table_t = context.Input("W"); + auto *ids_t = context.Input("Ids"); + auto *output_t = context.Output("Out"); int64_t padding_idx = context.Attr("padding_idx"); - auto* ids_var = context.InputVar("Ids"); - Tensor* output_t = context.Output("Out"); - - int64_t* ids; - int64_t K; - - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W; - // when Ids's type is SelectedRows, the rows of Ids contains the - // ids to be looked up in W. 
- if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); - K = ids_t->numel(); - } else if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().CUDAData(context.GetPlace())); - K = ids_t->rows().size(); - output_t->Resize({K, table_t->dims()[1]}); - } else { - PADDLE_THROW("Unsupported Variable Type of Ids"); - } size_t N = table_t->dims()[0]; size_t D = table_t->dims()[1]; - auto* table = table_t->data(); - auto* output = output_t->mutable_data(context.GetPlace()); + size_t K = ids_t->numel(); + + auto *ids = ids_t->data(); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); dim3 threads(128, 8); dim3 grids(8, 1); @@ -122,19 +105,19 @@ class LookupTableCUDAKernel : public framework::OpKernel { template class LookupTableGradCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = + void Compute(const framework::ExecutionContext &context) const override { + auto &dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - auto* ids = context.Input("Ids"); - auto* table = context.Input("W"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); + auto *ids = context.Input("Ids"); + auto *table = context.Input("W"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); + auto *ids_data = ids->data(); auto ids_dim = ids->dims(); auto stream = dev_ctx.stream(); @@ -150,12 +133,12 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { d_table->set_rows(new_rows); - auto* d_table_value = d_table->mutable_value(); + auto *d_table_value = d_table->mutable_value(); d_table_value->Resize({ids_dim[0], table->dims()[1]}); d_table_value->mutable_data(context.GetPlace()); - auto* d_table_data = d_table_value->data(); - auto* d_output_data = d_output->data(); + auto *d_table_data = d_table_value->data(); + auto *d_output_data = d_output->data(); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, d_output->numel() * sizeof(T), stream); @@ -168,9 +151,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { int N = d_table_t->dims()[0]; int D = d_table_t->dims()[1]; int K = ids_t->numel(); - const int64_t* ids = ids_t->data(); - const T* d_output = d_output_t->data(); - T* d_table = d_table_t->mutable_data(context.GetPlace()); + const int64_t *ids = ids_t->data(); + const T *d_output = d_output_t->data(); + T *d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index d482506bf0..c9f074ca0e 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -36,43 +36,13 @@ template class LookupTableKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { + auto *ids_t = context.Input("Ids"); // int tensor + auto *output_t = 
context.Output("Out"); // float tensor auto *table_var = context.InputVar("W"); - auto *ids_var = context.InputVar("Ids"); - Tensor *output_t = context.Output("Out"); - int64_t padding_idx = context.Attr("padding_idx"); - - DDim table_dim; - if (table_var->IsType()) { - table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); - table_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter W of a LookupTable " - "must be either LoDTensor or SelectedRows"); - } - - int64_t *ids; - int64_t ids_numel; - - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W; - // when Ids's type is SelectedRows, the rows of Ids contains the - // ids to be looked up in W. - if (ids_var->IsType()) { - auto *ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); - ids_numel = ids_t->numel(); - } else if (ids_var->IsType()) { - auto *ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().data()); - ids_numel = ids_t->rows().size(); - output_t->Resize({ids_numel, table_dim[1]}); - } else { - PADDLE_THROW("Unsupported Variable Type of Ids"); - } + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); if (table_var->IsType()) { auto *table_t = context.Input("W"); diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 080c185420..3712955b3b 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -142,14 +142,20 @@ class L2DecayRegularizer(WeightDecayRegularizer): dtype="float32", shape=param.shape, lod_level=param.lod_level) if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + idx = block.create_var( + dtype="int64", + shape=param.shape, + type=core.VarDesc.VarType.LOD_TENSOR) decay = block.create_var( dtype="float32", shape=param.shape, type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( type='lookup_table', inputs={'W': param, - 'Ids': grad}, + 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) param = decay @@ -216,14 +222,20 @@ class L1DecayRegularizer(WeightDecayRegularizer): dtype="float32", shape=param.shape, lod_level=param.lod_level) if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + idx = block.create_var( + dtype="int64", + shape=param.shape, + type=core.VarDesc.VarType.LOD_TENSOR) decay = block.create_var( dtype="float32", shape=param.shape, type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( type='lookup_table', inputs={'W': param, - 'Ids': grad}, + 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) diff --git a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py new file mode 100644 index 0000000000..6a41c44fe6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest + + +class TestExtractRows(OpTest): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Variable + feature_len = 12 + rows = [0, 4, 4, 7] + np_array = np.ones((len(rows), feature_len)).astype("float32") + + in_x = scope.var('X').get_selected_rows() + in_x.set_height(len(rows)) + in_x.set_rows(rows) + in_x_tensor = in_x.get_tensor() + in_x_tensor.set(np_array, place) + + # create Out Variable + out_tensor = scope.var('Out').get_tensor() + + # create and run the extract_rows operator + extract_rows_op = Operator("extract_rows", X='X', Out='Out') + extract_rows_op.run(scope, place) + + # get result from Out + result_array = np.array(out_tensor) + result_array = [ele[0] for ele in result_array] + assert result_array == rows + + def test_extract_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py index f8d5785fbf..e16ab1d15f 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py @@ -49,53 +49,6 @@ class TestLookupTableOpWithPadding(TestLookupTableOp): pass -class TestLookupTableIdsIsSelectedRows(OpTest): - def check_with_place(self, place): - scope = core.Scope() - - # create and initialize Variable - height = 10 - rows = [0, 4, 4, 7] - row_numel = 12 - - # create and initialize W Variable - W = scope.var('W').get_tensor() - W_array = np.full((height, row_numel), 1.0).astype("float32") - for i in range(height): - W_array[i] *= i - W.set(W_array, place) - - # create and initialize Ids Variable - ids_selected_rows = scope.var('Ids').get_selected_rows() - ids_selected_rows.set_height(len(rows)) - ids_selected_rows.set_rows(rows) - np_array = np.ones((len(rows), row_numel)).astype("float32") - ids_tensor = ids_selected_rows.get_tensor() - ids_tensor.set(np_array, place) - - # create Out Variable - Out = scope.var('Out').get_selected_rows() - - # create and run lookup_table operator - concat_rows_op = Operator("lookup_table", W='W', Ids='Ids', Out='Out') - concat_rows_op.run(scope, place) - - # get result from Out - Out_tensor = Out.get_tensor() - result_array = np.array(Out_tensor) - - # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(rows): - assert (row == result_array[idx]).all() - - def test_concat_rows(self): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - for place in places: - self.check_with_place(place) - - class TestLookupTableWIsSelectedRows(OpTest): def check_with_place(self, place): scope = core.Scope() From 4dbcb975f7817c9f88acac90fbfc80aeab1a09d5 Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Fri, 27 Jul 2018 16:51:56
+0800 Subject: [PATCH 37/37] Enable dist se resnext (#12365) * enable dist se resnext * small batch size --- python/paddle/fluid/tests/unittests/dist_se_resnext.py | 8 ++------ .../paddle/fluid/tests/unittests/test_dist_se_resnext.py | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 72bc1729b0..bf7816b246 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -278,7 +278,7 @@ class DistSeResneXt2x2: def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( - batch_size=20) + batch_size=2) if is_dist: t = get_transpiler(trainer_id, fluid.default_main_program(), endpoints, @@ -294,11 +294,7 @@ class DistSeResneXt2x2: strategy.num_threads = 1 strategy.allow_op_delay = False exe = fluid.ParallelExecutor( - True, - loss_name=avg_cost.name, - exec_strategy=strategy, - num_trainers=trainers, - trainer_id=trainer_id) + True, loss_name=avg_cost.name, exec_strategy=strategy) feed_var_list = [ var for var in trainer_prog.global_block().vars.itervalues() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index e3e7036f08..5ca13881bf 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -56,7 +56,7 @@ class TestDistSeResneXt2x2(unittest.TestCase): except os.error: retry_times -= 1 - def non_test_with_place(self): + def test_with_place(self): # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN required_envs = { "PATH": os.getenv("PATH"),