Merge remote-tracking branch 'upstream/develop' into windows/build

6 years ago · 30ddc07a7e
parent 1b75fd2236 e65cbd3b06
commit 30ddc07a7e
143 changed files with 3587 additions and 4511 deletions
--- a/AUTHORS.md
+++ b/AUTHORS.md
@ -43,6 +43,7 @@
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Superjom | Chun-Wei Yan |
+| tensor-tang | Jian Tang |
 | tianbingsz | Tian-Bing Xu |
 | tpatejko | Tomasz Patejko |
 | typhoonzero | Yi Wu |
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -184,11 +184,10 @@ endif ()

 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
-        SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-        ${src_dir}/${module}/api/paddle_inference_api.h
-        ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
-        )
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+       ${src_dir}/${module}/api/paddle_*.h
+       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+)

 set(module "platform")
 copy(platform_lib DEPS profiler_py_proto
@ -223,12 +222,12 @@ copy(third_party DEPS fluid_lib_dist
        DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
        )

-# only need libpaddle_fluid.so/a and paddle_inference_api.h for inference-only library
+# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
 copy(inference_api_lib DEPS fluid_lib_dist
-        SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-        ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h
-        DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
-        )
+  SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+       ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
+)

 add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)

--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@ -34,4 +34,5 @@ if(TENSORRT_FOUND)
        "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
    include_directories(${TENSORRT_INCLUDE_DIR})
    list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
+    add_definitions(-DPADDLE_WITH_TENSORRT)
 endif()
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
@ -184,6 +184,7 @@ paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name']
 paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@ -273,6 +274,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
 paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
+paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None))
 paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
 paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@ -30,8 +30,8 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
      local_scopes_(local_scopes),
      places_(places),
      graph_(std::move(graph)),
-      pool_(strategy.num_threads_ +
-            1),  // add one more thread for generate op_deps
+      pool_(strategy.num_threads_),
+      prepare_pool_(1),  // add one more thread for generate op_deps
      fetch_ctxs_(places) {
  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
    int dep = static_cast<int>(op->NotReadyInputSize());
@ -160,7 +160,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
  });
 }
 void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
-  atomic_op_deps_ = pool_.enqueue([&] {
+  atomic_op_deps_ = prepare_pool_.enqueue([&] {
    auto *op_deps = new std::unordered_map<OpHandleBase *, std::atomic<int>>;
    for (auto &pair : op_deps_) {
      (*op_deps)[pair.first] = pair.second;
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@ -46,6 +46,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
  std::vector<OpHandleBase *> bootstrap_ops_;

  ::ThreadPool pool_;
+  ::ThreadPool prepare_pool_;
  platform::DeviceContextPool fetch_ctxs_;
  std::atomic<int> remaining_;

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -359,6 +359,7 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                                  bool create_local_scope, bool create_vars,
                                  bool keep_kids) {
+  PADDLE_ENFORCE_NOT_NULL(scope);
  Scope* local_scope = scope;
  if (create_vars) {
    if (create_local_scope) {
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@ -5,6 +5,7 @@ file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")


 # Usage: pass_library(target inference) will append to paddle_inference_pass.h
+unset(INFER_IR_PASSES CACHE) # clear the global variable
 function(pass_library TARGET DEST)
    set(options "")
    set(oneValueArgs "")
@ -15,10 +16,11 @@ function(pass_library TARGET DEST)
    if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
        message(STATUS "add pass ${TARGET} ${DEST}")
        file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
-        set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+        set(INFER_IR_PASSES ${INFER_IR_PASSES} ${TARGET} CACHE INTERNAL "")
    endif()
 endfunction()

+
 cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node pretty_log)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@ -91,10 +91,10 @@ void FindWhileOp(Graph* graph) {
 #undef OP_SET_IN
 #undef OP_SET_OUT

-  auto* X = graph->RetriveNode(34);
-  auto* LSTMOUT = graph->RetriveNode(81);
-  auto* cell_init = graph->RetriveNode(6);
-  auto* hidden_init = graph->RetriveNode(8);
+  auto* X = graph->RetrieveNode(34);
+  auto* LSTMOUT = graph->RetrieveNode(81);
+  auto* cell_init = graph->RetrieveNode(6);
+  auto* hidden_init = graph->RetrieveNode(8);

  auto* lstm_op = graph->CreateOpNode(&op_desc);
  PrepareParameters(graph, param);
@ -212,11 +212,11 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,

  float* out_data = out->mutable_data<float>(platform::CPUPlace());
  std::array<const float*, 4> tensors{
-      {W_forget_w0.data<float>(), W_input_w0.data<float>(),
-        W_output_w0.data<float>(), W_cell_w0.data<float>()}};
+      W_forget_w0.data<float>(), W_input_w0.data<float>(),
+        W_output_w0.data<float>(), W_cell_w0.data<float>()};
  std::array<const float*, 4> tensors1{
-      {W_forget_w1.data<float>(), W_input_w1.data<float>(),
-        W_output_w1.data<float>(), W_cell_w1.data<float>()}};
+      W_forget_w1.data<float>(), W_input_w1.data<float>(),
+        W_output_w1.data<float>(), W_cell_w1.data<float>()};

  for (int row = 0; row < D; row++) {
    for (int col = 0; col < 4; col++) {
@ -239,8 +239,8 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
                     const LoDTensor& B_output, const LoDTensor& B_cell,
                     LoDTensor* out) {
  std::array<const float*, 4> tensors{
-      {B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
-        B_cell.data<float>()}};
+      B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
+        B_cell.data<float>()};

  PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
  int D = B_forget.dims()[0];
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@ -84,8 +84,6 @@ void CheckProgram(const ProgramDesc &program) {

 Graph::Graph(const ProgramDesc &program) : program_(program) {
  CheckProgram(program_);
-  // Make the nodes id start from 0.
-  Node::ResetId();
  auto var_nodes = InitFromProgram(program_);
  ResolveHazard(var_nodes);
 }
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@ -116,13 +116,17 @@ class Graph {
  // Create a normal variable with non-null VarDesc.
  ir::Node *CreateVarNode(VarDesc *var_desc) {
    PADDLE_ENFORCE(var_desc);
-    return AddNode(new ir::Node(var_desc));
+    auto *x = AddNode(new ir::Node(var_desc));
+    x->SetId(num_node_created_++);
+    return x;
  }

  // Create a normal runnable operator with OpDesc.
  ir::Node *CreateOpNode(OpDesc *op_desc) {
    PADDLE_ENFORCE(op_desc);
-    return AddNode(new ir::Node(op_desc));
+    auto *x = AddNode(new ir::Node(op_desc));
+    x->SetId(num_node_created_++);
+    return x;
  }

  // Create a control dependency var that connects 2 operations. The
@ -132,13 +136,17 @@ class Graph {
    // TODO(panyx0718): control var name should be really unique.
    const std::string name = string::Sprintf(
        "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
-    return AddNode(new ir::Node(name, ir::Node::Type::kVariable));
+    auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
+    x->SetId(num_node_created_++);
+    return x;
  }

  // A more free style way of creating a graph node. Mostly use for test
  // or "copy" from another node. Avoid using it if possible.
  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) {
-    return AddNode(new ir::Node(name, type));
+    auto *x = AddNode(new ir::Node(name, type));
+    x->SetId(num_node_created_++);
+    return x;
  }

  // Clear all node information of the graph and return the ownership of the
@ -160,7 +168,7 @@ class Graph {
  }

  // NOTE low performance, but simple and secure.
-  Node *RetriveNode(int id) {
+  Node *RetrieveNode(int id) {
    for (auto &node : nodes_) {
      if (node.second->id() == id) {
        return node.second.get();
@ -169,6 +177,7 @@ class Graph {
    return nullptr;
  }

+  const ProgramDesc &program() const { return program_; }
  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
      const ProgramDesc &program);

@ -190,6 +199,7 @@ class Graph {
  std::map<std::string, std::function<void(void)>> attr_dels_;
  std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
  std::unordered_set<ir::Node *> node_set_;
+  size_t num_node_created_{0};  // help to generate a unique node id.
 };

 bool IsControlDepVar(const ir::Node &var);
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@ -167,10 +167,12 @@ struct HitGroup {

  bool Match(Node *node, PDNode *pat) {
    if (nodes_.count(node)) {
-      if (!roles.count(pat)) return false;
-      return roles[pat] == node;
+      if (roles.count(pat) && roles[pat] == node) return true;
+      return false;
+    } else {
+      if (roles.count(pat) && roles[pat] != node) return false;
+      return true;
    }
-    return !roles.count(pat) || roles.at(pat) == node;
  }

  void Register(Node *node, PDNode *pat) {
@ -198,7 +200,6 @@ GraphPatternDetector::DetectPatterns() {
  std::vector<GraphPatternDetector::subgraph_t> result;
  std::vector<HitGroup> init_groups;
  std::array<std::vector<HitGroup>, 2> bi_records;
-  // PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
  auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
                                               : pattern_.edges().front().first;
  if (!pdnodes2nodes_.count(first_pnode)) return result;
@ -228,11 +229,12 @@ GraphPatternDetector::DetectPatterns() {
        VLOG(80) << "check " << source->id() << " -- " << target->id();
        // TODO(Superjomn) add some prune strategies.
        for (const auto &group : pre_groups) {
-          HitGroup new_group = group;
-          if (IsNodesLink(source, target) &&
-              new_group.Match(source, edge.first)) {
-            new_group.Register(source, edge.first);
-            if (new_group.Match(target, edge.second)) {
+          if (IsNodesLink(source, target)) {
+            HitGroup new_group = group;
+            bool flag = new_group.Match(source, edge.first) &&
+                        new_group.Match(target, edge.second);
+            if (flag) {
+              new_group.Register(source, edge.first);
              new_group.Register(target, edge.second);
              cur_groups.push_back(new_group);
              // TODO(Superjomn) need to unique
@ -261,14 +263,16 @@ GraphPatternDetector::DetectPatterns() {
  return result;
 }

-bool GraphItemCMP(const std::pair<PDNode *, Node *> &a,
+struct GraphItemLessThan {
+  bool operator()(const std::pair<PDNode *, Node *> &a,
                  const std::pair<PDNode *, Node *> &b) {
-  if (a.first != b.first) {
-    return a.first < b.first;
-  } else {
-    return a.second < b.second;
+    if (a.first != b.first) {
+      return a.first < b.first;
+    } else {
+      return a.second < b.second;
+    }
  }
-}
+};

 // TODO(Superjomn) enhance the function as it marks unique subgraphs as
 // duplicates
 // see https://github.com/PaddlePaddle/Paddle/issues/13550
@ -282,7 +286,7 @@ void GraphPatternDetector::UniquePatterns(
  for (auto &g : *subgraphs) {
    // Sort the items in the sub-graph, and transform to a string key.
    std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
-    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP);
+    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
    std::stringstream ss;
    for (auto &item : sorted_keys) {
      ss << item.first << ":" << item.second;
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@ -310,8 +310,8 @@ void GraphSafeRemoveNodes(Graph* graph,
                          const std::unordered_set<const Node*>& nodes);

 // Some pre-defined patterns those can be reused in multiple passes.
-// The related Fluid Layer or Op should be one pattern here for better reusage
-// accross different fusion.
+// The related Fluid Layer or Op should be one pattern here for better re-usage
+// across different fusion.
 namespace patterns {

 struct KeyCounter {
--- a/paddle/fluid/framework/ir/graph_to_program_pass.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc
@ -35,10 +35,11 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
      new proto::ProgramDesc(*program.Proto()));

  auto block = program_pb->mutable_blocks(kRootBlockIndex);
+  block->set_idx(kRootBlockIndex);
  block->clear_vars();
  std::unordered_set<std::string> visited_vars;
  for (ir::Node* n : graph->Nodes()) {
-    if (n->NodeType() == ir::Node::Type::kVariable) {
+    if (n->IsVar()) {
      if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) {
        visited_vars.insert(n->Var()->Name());
        block->add_vars()->MergeFrom(*n->Var()->Proto());
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@ -66,6 +66,76 @@ NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) {
 }
 Node *NodesDFSIterator::operator->() { return stack_.top(); }

+// Returns true when `node` has exactly `n` input nodes.
+inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
+  const auto indegree = node.inputs.size();
+  return indegree == n;
+}
+
+// Computes a topological order of the nodes reachable from `source` and stores
+// it in `sorted_`.
+//
+// `source` must be non-empty and every node in it must have no inputs. A node
+// is emitted only after all of its inputs have been emitted; the relative
+// order of nodes that become ready in the same pass follows the iteration
+// order of an unordered_set and is therefore unspecified.
+//
+// Fails an enforce check (instead of looping forever) when a pass makes no
+// progress, i.e. the graph contains a cycle or a pending node has an input
+// that is not reachable from `source`.
+NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
+  PADDLE_ENFORCE(!source.empty(),
+                 "Start points of topological sorting should not be empty!");
+  // CHECK all the inputs' in-degree is 0
+  for (auto *node : source) {
+    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
+  }
+
+  std::unordered_set<Node *> visited;
+  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+
+  while (!to_visit.empty()) {
+    // Iterate over a snapshot because to_visit is modified inside the loop.
+    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
+    bool progressed = false;
+    for (auto *p : queue) {
+      // A node is ready once every one of its inputs has been emitted.
+      const bool ready =
+          std::all_of(p->inputs.begin(), p->inputs.end(),
+                      [&](Node *x) { return visited.count(x) != 0; });
+      if (!ready) continue;
+
+      sorted_.push_back(p);
+      for (auto *out : p->outputs) {
+        if (!visited.count(out)) {
+          to_visit.insert(out);
+        }
+      }
+
+      to_visit.erase(p);
+      visited.insert(p);
+      progressed = true;
+    }
+    // If nothing was emitted, visited and to_visit are unchanged and the next
+    // pass would be identical, so bail out rather than spin.
+    PADDLE_ENFORCE(progressed,
+                   "Topological sorting made no progress; the graph has a "
+                   "cycle or a node whose input is unreachable.");
+  }
+}
+
+// Copies both the sorted node list and the current cursor position.
+NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
+    : sorted_(other.sorted_), cursor_(other.cursor_) {}
+
+// Returns the node at the cursor; the enforce check fails when the cursor is
+// not inside the sorted list.
+Node &NodesTSIterator::operator*() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  Node *current = sorted_[cursor_];
+  return *current;
+}
+
+// Advances to the next node. After the last node has been passed, the iterator
+// is reset to the empty state (no nodes, cursor 0).
+NodesTSIterator &NodesTSIterator::operator++() {
+  ++cursor_;
+  const bool exhausted = cursor_ >= sorted_.size();
+  if (exhausted) {
+    sorted_.clear();
+    cursor_ = 0;
+  }
+  return *this;
+}
+// Copy-assigns the sorted node list and the cursor from `other`.
+NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
+  sorted_ = other.sorted_;
+  cursor_ = other.cursor_;
+  return *this;
+}
+
+// Two iterators are equal when they hold identical sorted node lists and are
+// at the same cursor position.
+bool NodesTSIterator::operator==(const NodesTSIterator &other) {
+  if (!(sorted_ == other.sorted_)) return false;
+  return cursor_ == other.cursor_;
+}
+
+// Returns a pointer to the node at the cursor; the enforce check fails when
+// the cursor is not inside the sorted list.
+Node *NodesTSIterator::operator->() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  Node *current = sorted_[cursor_];
+  return current;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_traits.h
+++ b/paddle/fluid/framework/ir/graph_traits.h
@ -62,6 +62,32 @@ struct NodesDFSIterator
  std::unordered_set<Node *> visited_;
 };

+// Topological sorting iterator on nodes.
+// The order is held in `sorted_`, and `cursor_` is the position of the current
+// node within it.
+struct NodesTSIterator
+    : public std::iterator<std::forward_iterator_tag, Node *> {
+  // An empty iterator (no nodes, cursor 0).
+  NodesTSIterator() = default;
+  // Builds the iterator from the given start nodes.
+  NodesTSIterator(const std::vector<Node *> &source);
+  // Takes over the sorted list of `other` and resets the cursor of `other`.
+  NodesTSIterator(NodesTSIterator &&other)
+      : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
+    other.cursor_ = 0;
+  }
+  NodesTSIterator(const NodesTSIterator &other);
+
+  Node &operator*();
+  NodesTSIterator &operator++();
+  NodesTSIterator &operator=(const NodesTSIterator &other);
+  bool operator==(const NodesTSIterator &other);
+  // Defined as the negation of operator==.
+  bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
+  Node *operator->();
+
+ private:
+  std::vector<Node *> sorted_;  // nodes in iteration order
+  size_t cursor_{0};            // index of the current node in sorted_
+};
+
 /*
 * GraphTraits contains some graph traversal algorithms.
 *
@ -76,6 +102,14 @@ struct GraphTraits {
                                            NodesDFSIterator());
  }

+  // Returns a range that visits the nodes of `g` in topological order. The
+  // traversal starts from the graph's start points and the range ends at a
+  // default-constructed (empty) NodesTSIterator. Fails the enforce check when
+  // the graph has no start point.
+  static iterator_range<NodesTSIterator> TS(const Graph &g) {
+    auto start_points = ExtractStartPoints(g);
+    PADDLE_ENFORCE(!start_points.empty());
+    // Build the begin iterator exactly once: constructing a NodesTSIterator
+    // performs the whole sort.
+    return iterator_range<NodesTSIterator>(NodesTSIterator(start_points),
+                                           NodesTSIterator());
+  }
+
 private:
  // The nodes those have no input will be treated as start points.
  static std::vector<Node *> ExtractStartPoints(const Graph &g) {
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@ -119,37 +119,30 @@ class Node {
  int id_;

 private:
+  // ID can only be set by a Graph.
+  void SetId(int id) { id_ = id; }
+
  friend class Graph;
  friend std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
                                                 Node::Type type);

  explicit Node(const std::string& name, Type type)
-      : name_(name),
-        var_desc_(nullptr),
-        op_desc_(nullptr),
-        type_(type),
-        id_(count_++) {}
+      : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}

  explicit Node(VarDesc* var_desc)
      : name_(var_desc->Name()),
        var_desc_(new VarDesc(*var_desc)),
        op_desc_(nullptr),
-        type_(Type::kVariable),
-        id_(count_++) {}
+        type_(Type::kVariable) {}

  explicit Node(OpDesc* op_desc)
      : name_(op_desc->Type()),
        var_desc_(nullptr),
        op_desc_(new OpDesc(*op_desc, op_desc->Block())),
-        type_(Type::kOperation),
-        id_(count_++) {}
+        type_(Type::kOperation) {}

  Node() = delete;

-  static int count_;
-  // Please don't use this API or make this public.
-  static void ResetId() { count_ = 0; }
-
  boost::any wrapper_;
  std::function<void(void)> wrapper_deleter_;
  std::type_index wrapper_type_ = std::type_index(typeid(void));
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@ -93,6 +93,7 @@ class Pass {
 protected:
  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
    LOG(FATAL) << "Calling virtual Pass not implemented.";
+    return graph;
  }

 private:
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@ -57,60 +57,58 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
  }
 }

-void NaiveExecutor::Prepare(Scope *parent_scope,
-                            const ProgramDesc &program_desc, int block_id,
-                            bool with_feed_fetch_ops) {
-  if (!parent_scope) {
+void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
+                            int block_id, bool with_feed_fetch_ops) {
+  if (!scope) {
    scope_ = new framework::Scope;
  } else {
-    scope_ = &parent_scope->NewScope();
+    scope_ = scope;
  }
-  CreateVariables(program_desc, scope_, block_id);
+
+  VLOG(3) << "NaiveExecutor init with scope " << scope;
  CreateOps(program_desc, block_id, with_feed_fetch_ops);
 }

 void NaiveExecutor::Run() {
  for (auto &op : ops_) {
-    VLOG(40) << "run " << op->Type();
+    VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
+            << " on scope " << scope_;
    op->Run(*scope_, place_);
  }
 }

-void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
-                                    int block_id) {
-  PADDLE_ENFORCE(scope);
+void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
+                                    bool persistable, Scope *scope) {
+  PADDLE_ENFORCE_NOT_NULL(scope);
+
  auto &global_block = desc.Block(block_id);

-  const Scope *ancestor_scope = scope;
-  while (ancestor_scope->parent()) {
-    ancestor_scope = ancestor_scope->parent();
+  const auto *anc = scope;
+  PADDLE_ENFORCE(anc->parent() != anc);
+  while (anc->parent()) {
+    anc = anc->parent();
  }

-  if (ancestor_scope != scope) {
-    for (auto &var : global_block.AllVars()) {
-      if (var->Name() == framework::kEmptyVarName) {
-        continue;
-      }
-      // Create persistable vars in ancestor scope.
-      if (var->Persistable()) {
-        auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
-        VLOG(30) << "Create Variable " << var->Name()
-                 << " global, which pointer is " << ptr;
-      } else {  // Create temporary variables in local scope.
-        auto *ptr = scope->Var(var->Name());
+  for (auto &var : global_block.AllVars()) {
+    if (var->Name() == framework::kEmptyVarName) {
+      continue;
+    }
+
+    if (persistable == var->Persistable()) {
+      if (persistable) {
+        if (!anc->FindVar(var->Name())) {
+          auto *ptr = const_cast<Scope *>(anc)->Var(var->Name());
+          VLOG(3) << scope << " Create persistable variable " << var->Name()
+                  << ", which pointer is " << ptr;
+          InitializeVariable(ptr, var->GetType());
+        }
+      } else {
+        auto *ptr = const_cast<Scope *>(scope)->Var(var->Name());
+        VLOG(3) << scope << " Create variable " << var->Name()
+                << ", which pointer is " << ptr;
        InitializeVariable(ptr, var->GetType());
-        VLOG(30) << "Create Variable " << var->Name()
-                 << " locally, which pointer is " << ptr;
      }
    }
-  } else {
-    for (auto &var : global_block.AllVars()) {
-      auto *ptr = scope->Var(var->Name());
-      InitializeVariable(ptr, var->GetType());
-      VLOG(30) << "Create variable " << var->Name() << ", which pointer is "
-               << ptr;
-    }
  }
 }

--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@ -35,8 +35,14 @@ class NaiveExecutor {
  // Create child scope.
  // Create variables.
  // @with_feed_fetch_ops: whether to work with the feed and fetch operators.
-  void Prepare(Scope* parent_scope, const ProgramDesc& program_desc,
-               int block_id, bool with_feed_fetch_ops);
+  void Prepare(Scope* scope, const ProgramDesc& program_desc, int block_id,
+               bool with_feed_fetch_ops);
+
+  // Create variables beforehand.
+  // Create parameters if persistable is true, or create the temporary variables
+  // instead.
+  void CreateVariables(const ProgramDesc& desc, int block_id, bool persistable,
+                       Scope* scope);

  // Run all the operators.
  void Run();
@ -49,8 +55,6 @@ class NaiveExecutor {
  void CleanFeedFetchOps();

 protected:
-  void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
-
  void CreateOps(const ProgramDesc& desc, int block_id,
                 bool with_feed_fetch_ops);

--- a/paddle/fluid/framework/naive_executor_test.cc
+++ b/paddle/fluid/framework/naive_executor_test.cc
@ -39,7 +39,7 @@ TEST(NaiveExecutor, Basic) {

  auto place = platform::CPUPlace();
  NaiveExecutor exe(place);
-  exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
+  exe.Prepare(nullptr, program, 0, false);
  auto* a_tensor = exe.FindTensor("a");
  auto* b_tensor = exe.FindTensor("b");
  auto* c_tensor = exe.FindTensor("c");
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"

 #include <memory>  // for unique_ptr
+#include <queue>
 #include <set>
+#include <unordered_set>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
@ -36,6 +38,16 @@ DEFINE_double(
    "Memory size threshold (GB) when the garbage collector clear tensors."
    "Disabled when this value is less than 0");

+// In inference scenarios, a scope will not be written by two threads at the
+// same time, but it may be read by multiple threads concurrently, and the
+// mutex would cause a serious performance issue.
+// So the mutex is disabled when `ON_INFER` is defined.
+#ifdef ON_INFER
+#define SCOPE_LOCK_GUARD
+#else
+#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
+#endif
+
 namespace paddle {
 namespace framework {

@ -49,18 +61,18 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }

 Scope& Scope::NewScope() const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  kids_.push_back(new Scope(this));
  return *kids_.back();
 }

 Variable* Scope::Var(const std::string& name) {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  return VarInternal(name);
 }

 Variable* Scope::Var(std::string* name) {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  if (name != nullptr) {
    *name = new_name;
@ -69,34 +81,34 @@ Variable* Scope::Var(std::string* name) {
 }

 Variable* Scope::FindVar(const std::string& name) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  return FindVarInternal(name);
 }

 Variable* Scope::FindLocalVar(const std::string& name) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  return FindVarLocally(name);
 }

 const Scope* Scope::FindScope(const Variable* var) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  return FindScopeInternal(var);
 }

 void Scope::DropKids() {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  for (Scope* s : kids_) delete s;
  kids_.clear();
 }

 bool Scope::HasKid(const Scope* scope) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  return it != this->kids_.end();
 }

 std::vector<std::string> Scope::LocalVarNames() const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  std::vector<std::string> known_vars;
  known_vars.reserve(this->vars_.size());
  for (auto& p : vars_) {
@ -106,9 +118,10 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }

 void Scope::DeleteScope(Scope* scope) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
-  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
+  PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
+                 this, scope);
  this->kids_.erase(it);
  // When making memory benchmark on Fluid, we have to delete scope sync.
  if (FLAGS_benchmark || FLAGS_eager_delete_scope) {
@ -119,7 +132,7 @@ void Scope::DeleteScope(Scope* scope) const {
 }

 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
@ -132,12 +145,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {

 void Scope::Rename(const std::string& origin_name,
                   const std::string& new_name) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  RenameInternal(origin_name, new_name);
 }

 std::string Scope::Rename(const std::string& origin_name) const {
-  std::lock_guard<std::mutex> lock(mutex_);
+  SCOPE_LOCK_GUARD
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  RenameInternal(origin_name, new_name);
  return new_name;
@ -189,5 +202,46 @@ Variable* Scope::FindVarLocally(const std::string& name) const {
  return nullptr;
 }

+// Builds a human-readable dump of the scope tree rooted at `root`.
+// The first part lists scope addresses level by level (breadth-first), with a
+// separator line after each level; the "Details" part lists every visited
+// scope followed by its local variable names.
+// Returns an empty string when `root` is null.
+std::string GenScopeTreeDebugInfo(Scope* root) {
+  if (root == nullptr) return "";
+
+  std::stringstream out;
+  std::vector<Scope*> visited;  // scopes in breadth-first order
+
+  std::queue<Scope*> pending;
+  pending.push(root);
+
+  while (!pending.empty()) {
+    // Everything queued at this point belongs to the current level.
+    for (size_t remaining = pending.size(); remaining > 0; --remaining) {
+      Scope* current = pending.front();
+      pending.pop();
+      out << current << " ";
+      visited.push_back(current);
+
+      for (auto* kid : current->kids()) {
+        pending.push(kid);
+      }
+    }
+    // end of a level
+    out << "\n------------------------------------------\n";
+  }
+
+  out << "\nDetails:\n\n";
+
+  for (Scope* scope : visited) {
+    out << "====\n";
+    out << scope << ":\n";
+    for (auto& var : scope->LocalVarNames()) {
+      out << "  - " << var << "\n";
+    }
+  }
+
+  return out.str();
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@ -78,11 +78,11 @@ class Scope {
  /// Drop all kids scopes belonged to this scope.
  void DropKids();

-  std::list<Scope*>& kids() const { return kids_; }
-
  /// Find if a scope exists in the kid scopes
  bool HasKid(const Scope* scope) const;

+  const std::list<Scope*>& kids() const { return kids_; }
+
  // enumerate all the variables current contains.
  std::vector<std::string> LocalVarNames() const;

@ -118,12 +118,17 @@ class Scope {

  // Scope in `kids_` are owned by this class.
  mutable std::list<Scope*> kids_;
-  Scope const* parent_{nullptr};
+  const Scope* parent_{nullptr};

  DISABLE_COPY_AND_ASSIGN(Scope);

 private:
  mutable std::mutex mutex_;
 };
+
+// Generate a debug string describing the inheritance (parent/kid) structure
+// of the scope tree; the output format is quite naive.
+std::string GenScopeTreeDebugInfo(Scope*);
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@ -63,6 +63,26 @@ struct TensorCopyVisitor {
  int64_t size_;
 };

+// Visitor that overwrites a contiguous range of a tensor with a constant.
+// It is passed to framework::VisitDataType, which selects the element type T
+// for apply() from the tensor's data type.
+struct TensorFillVisitor {
+  // dst:        tensor to write into.
+  // dst_offset: index of the first element to overwrite.
+  // size:       number of consecutive elements to overwrite.
+  // value:      fill value; converted to the element type T in apply().
+  TensorFillVisitor(framework::Tensor* dst, int64_t dst_offset, int64_t size,
+                    float value)
+      : dst_(dst), dst_offset_(dst_offset), size_(size), value_(value) {}
+
+  template <typename T>
+  void apply() const {
+    // TODO(qiao): support other place
+    platform::CPUPlace cpu;
+    auto* tensor_data = dst_->mutable_data<T>(cpu);
+    auto* start = tensor_data + dst_offset_;
+    auto* end = start + size_;
+    // Honor the requested fill value instead of a hard-coded zero, so the
+    // constructor's `value` argument is no longer silently ignored.
+    std::fill(start, end, static_cast<T>(value_));
+  }
+
+  framework::Tensor* dst_;
+  int64_t dst_offset_;
+  int64_t size_;
+  float value_;
+};
+
 void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
                       const platform::DeviceContext& dev_ctx) {
  {  // the 1st field, uint32_t version
@ -120,7 +140,17 @@ bool SelectedRows::HasKey(int64_t key) const {
                                                                   : true;
 }

-int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) {
+int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown,
+                                     bool is_test) {
+  if (is_test) {
+    auto iter = id_to_index_.find(key);
+    if (iter == id_to_index_.end()) {
+      return -1;
+    } else {
+      return iter->second;
+    }
+  }
+
  rwlock_->RDLock();
  auto iter = id_to_index_.find(key);
  if (iter == id_to_index_.end()) {
@ -172,7 +202,7 @@ void SelectedRows::SyncIndex() {
 }

 void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
-                       bool auto_grown) {
+                       bool auto_grown, bool is_test) {
  PADDLE_ENFORCE(value->IsInitialized(),
                 "The value tensor should be initialized.");
  if (ids.numel() == 0) {
@ -183,11 +213,19 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
                      "output tensor should have the same shape with table "
                      "except the dims[0].");
    for (int i = 0; i < ids.numel(); ++i) {
-      int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown);
-      framework::VisitDataType(
-          framework::ToDataType(value_->type()),
-          TensorCopyVisitor(value, i * value_width, *value_.get(),
-                            index * value_width, value_width));
+      auto id = ids.data<int64_t>()[i];
+      int64_t index = AutoGrownIndex(id, auto_grown, is_test);
+      if (index < 0) {
+        VLOG(5) << "id " << id << " not in the table, return 0";
+        framework::VisitDataType(
+            framework::ToDataType(value_->type()),
+            TensorFillVisitor(value, i * value_width, value_width, 0.0));
+      } else {
+        framework::VisitDataType(
+            framework::ToDataType(value_->type()),
+            TensorCopyVisitor(value, i * value_width, *value_.get(),
+                              index * value_width, value_width));
+      }
    }
  }
 }
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@ -105,7 +105,7 @@ class SelectedRows {
   * the value
   */
  void Get(const framework::Tensor& ids, framework::Tensor* value,
-           bool auto_grown = false);
+           bool auto_grown = false, bool is_test = false);

  /*
   * @brief Get the index of the key from id_to_index_ map. If the key not
@ -118,7 +118,7 @@ class SelectedRows {
   *
   * @return index of the key.
   */
-  int64_t AutoGrownIndex(int64_t key, bool auto_grown);
+  int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false);

  void SyncIndex();

--- a/Show More
+++ b/Show More