merge develop

revert-12469-sum_op_dim_fix
nhzlx 7 years ago
commit f55e8901c8

.gitignore

@@ -5,6 +5,7 @@ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
*.DS_Store
*.vs
build/
build_doc/
*.user
@@ -15,6 +16,7 @@ build_doc/
.cproject
.pydevproject
.settings/
CMakeSettings.json
Makefile
.test_env/
third_party/

@@ -204,11 +204,12 @@ include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(external/anakin)
else()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
endif()
include(cudnn) # set cudnn libraries, must before configure

@@ -56,6 +56,10 @@ if(NOT CMAKE_CROSSCOMPILING)
set(SIMD_FLAG ${SSE3_FLAG})
endif()
endif()
if(UNIX AND NOT APPLE)
# exclude Apple from the UNIX-like OS family
set(LINUX TRUE)
endif(UNIX AND NOT APPLE)
if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
@@ -104,6 +108,10 @@ if(WITH_GPU)
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
endif()
set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY})
message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h")
message(STATUS "cudnn library is ${CUDNN_LIBRARY}")
endif()
elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)

@@ -35,9 +35,10 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
ExternalProject_Add(
extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS}
# TODO(luotao): use PaddlePaddle/Anakin later
DEPENDS ${MKLML_PROJECT}
# Anakin code fails on Intel(R) Xeon(R) Gold 5117 CPUs; temporarily do not compile AVX512-related code.
GIT_REPOSITORY "https://github.com/luotao1/Anakin"
GIT_TAG "3957ae9263eaa0b1986758dac60a88852afb09be"
GIT_TAG "bcf17aabe7921ceb7bce591244b4f9dce7dba5c8"
PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES

@@ -4,7 +4,7 @@ Paddle Inference API
To make inference deployment simpler and easier, Fluid provides a set of
high-level APIs that hide the different underlying optimized implementations.
`Inference library code <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
`Inference library code <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__
It includes:
- The header file ``paddle_inference_api.h``, which defines all the interfaces
@@ -104,5 +104,5 @@ engine
------------
- `inference
demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference/demo>`__
- `Complex single-threaded/multi-threaded examples <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc>`__
demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api/demo_ci>`__
- `Complex single-threaded/multi-threaded examples <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/api_impl_tester.cc>`__

@@ -155,10 +155,11 @@ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale',
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))

@@ -128,7 +128,8 @@ struct ExtractAttribute {
attr_value = &boost::get<T>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
attr_name_, typeid(T).name(), attr.type().name());
attr_name_, paddle::platform::demangle(typeid(T).name()),
paddle::platform::demangle(attr.type().name()));
}
return attr_value;
}
@@ -160,7 +161,7 @@ struct ExtractAttribute<bool> {
attr_value = &boost::get<bool>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
attr_name_, attr.type().name());
attr_name_, paddle::platform::demangle(attr.type().name()));
}
return attr_value;
}
@@ -186,7 +187,7 @@ struct ExtractAttribute<int64_t> {
attr_value = &boost::get<int64_t>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
attr_name_, attr.type().name());
attr_name_, paddle::platform::demangle(attr.type().name()));
}
return attr_value;
}
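
// Editor's note: a brief, hypothetical illustration (not part of the commit)
// of what demangle() buys in these error messages. On GCC/Clang,
// typeid(...).name() returns a mangled string; the exact demangled spelling
// is compiler-dependent, and the header location of demangle() is assumed.
#include <iostream>
#include <typeinfo>
#include <vector>
#include "paddle/fluid/platform/enforce.h"  // assumed home of demangle()

int main() {
  // Prints something like "St6vectorIiSaIiEE" on GCC.
  std::cout << typeid(std::vector<int>).name() << "\n";
  // Prints "std::vector<int, std::allocator<int> >" instead.
  std::cout << paddle::platform::demangle(typeid(std::vector<int>).name())
            << "\n";
  return 0;
}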

@@ -3,7 +3,10 @@ cc_library(graph SRCS graph.cc DEPS node)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)

@@ -0,0 +1,187 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <array>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
nodes_.emplace_back(new PDNode(std::move(teller), name));
auto* cur = nodes_.back().get();
return cur;
}
void PDPattern::AddEdge(PDNode* a, PDNode* b) {
PADDLE_ENFORCE(a);
PADDLE_ENFORCE(b);
PADDLE_ENFORCE(a != b, "cannot connect a node to itself.");
edges_.emplace_back(a, b);
}
void GraphPatternDetecter::operator()(Graph* graph,
GraphPatternDetecter::handle_t handler) {
if (!MarkPDNodesInGraph(*graph)) return;
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
for (auto& g : subgraphs) {
handler(g, graph);
}
}
bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) {
if (graph.Nodes().empty()) return false;
for (auto& node : GraphTraits::DFS(graph)) {
for (const auto& pdnode : pattern_.nodes()) {
if (pdnode->Tell(&node)) {
pdnodes2nodes_[pdnode.get()].insert(&node);
}
}
}
return !pdnodes2nodes_.empty();
}
struct HitGroup {
std::unordered_map<PDNode*, Node*> roles;
bool Match(Node* node, PDNode* pat) {
return !roles.count(pat) || roles.at(pat) == node;
}
void Register(Node* node, PDNode* pat) { roles[pat] = node; }
};
// Tell whether Node a links to b.
bool IsNodesLink(Node* a, Node* b) {
for (auto* node : a->outputs) {
if (b == node) {
return true;
}
}
return false;
}
std::vector<GraphPatternDetecter::subgraph_t>
GraphPatternDetecter::DetectPatterns() {
// Init empty subgraphs.
std::vector<GraphPatternDetecter::subgraph_t> result;
std::vector<HitGroup> init_groups;
PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
auto* first_pnode = pattern_.edges().front().first;
if (!pdnodes2nodes_.count(first_pnode)) return result;
for (auto* node : pdnodes2nodes_[first_pnode]) {
HitGroup group;
group.roles[first_pnode] = node;
init_groups.emplace_back(group);
}
int step = 0;
std::array<std::vector<HitGroup>, 2> bi_records;
bi_records[0] = std::move(init_groups);
// Extend a PDNode to subgraphs by deducing the connection relations defined
// in edges of PDNodes.
for (const auto& edge : pattern_.edges()) {
// Each edge connects two PDNodes, which represent two roles.
// Detect pairs of Nodes that match these two roles and are connected.
auto& pre_groups = bi_records[step % 2];
auto& cur_groups = bi_records[1 - (step++ % 2)];
cur_groups.clear();
// source -> target
for (Node* source : pdnodes2nodes_[edge.first]) {
for (Node* target : pdnodes2nodes_[edge.second]) {
// TODO(Superjomn) add some prune strategies.
for (const auto& group : pre_groups) {
HitGroup new_group = group;
if (IsNodesLink(source, target) &&
new_group.Match(source, edge.first)) {
new_group.Register(source, edge.first);
if (new_group.Match(target, edge.second)) {
new_group.Register(target, edge.second);
cur_groups.push_back(new_group);
// TODO(Superjomn) need to unique
}
}
}
}
}
}
for (auto& group : bi_records[step % 2]) {
GraphPatternDetecter::subgraph_t subgraph;
for (auto& role : group.roles) {
subgraph.emplace(role.first, role.second);
}
result.emplace_back(subgraph);
}
return result;
}
void GraphPatternDetecter::UniquePatterns(
std::vector<GraphPatternDetecter::subgraph_t>* subgraphs) {
if (subgraphs->empty()) return;
std::vector<GraphPatternDetecter::subgraph_t> result;
std::unordered_set<size_t> set;
for (auto& g : *subgraphs) {
size_t key = 0;
for (auto& item : g) {
key ^= std::hash<void*>{}(item.first);
key ^= std::hash<void*>{}(item.second);
}
if (!set.count(key)) {
result.emplace_back(g);
set.insert(key);
}
}
*subgraphs = result;
}
void GraphPatternDetecter::RemoveOverlappedMatch(
std::vector<subgraph_t>* subgraphs) {
std::vector<subgraph_t> result;
std::unordered_set<Node*> node_set;
for (const auto& subgraph : *subgraphs) {
bool valid = true;
for (auto& item : subgraph) {
if (node_set.count(item.second)) {
valid = false;
break;
}
}
if (valid) {
for (auto& item : subgraph) {
node_set.insert(item.second);
}
result.push_back(subgraph);
}
}
*subgraphs = result;
}
} // namespace ir
} // namespace framework
} // namespace paddle

@@ -0,0 +1,181 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest_prod.h>
#endif
#include <numeric>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace framework {
namespace ir {
// Some basic terminology:
// - PDPattern: a pattern defined as a data flow graph.
// - PDNode: the node in the pattern, each PDNode represents an `ir::Node`
// that meets some conditions defined in `PDNode.teller`.
// - A pattern is defined by PDNodes connected with edges.
// Pattern detector node. This node helps to build a pattern.
struct PDNode {
// Tells whether an ir::Node* is a candidate for a PDNode.
using teller_t = std::function<bool(Node*)>;
PDNode(teller_t&& teller, const std::string& name = "")
: teller_(teller), name_(name) {
PADDLE_ENFORCE(teller_ != nullptr, "invalid teller function is set.");
}
PDNode(PDNode&& other) = default;
std::vector<PDNode*> inlinks;
std::vector<PDNode*> outlinks;
bool Tell(Node* node) const {
PADDLE_ENFORCE(teller_ != nullptr, "teller should be set for a PDNode");
return teller_(node);
}
const std::string& name() const { return name_; }
PDNode(const PDNode&) = delete;
PDNode& operator=(const PDNode&) = delete;
private:
teller_t teller_;
std::string name_;
};
/*
* A pattern in a graph, which is defined with PDNodes and edges. Most graph
* patterns can be divided into PDNodes and the link relations between them.
*
* For example, the FC fusion needs to filter the MUL and ELEMENTWISE_ADD
* operators from the computation graph; the MUL's output should have only one
* consumer, which is the ELEMENTWISE_ADD.
* This pattern can be defined with the following pseudocode
*
* // Create two operator PDNodes.
* MUL = PDPattern.NewNode()
* ELE = PDPattern.NewNode()
* // Create the variable PDNodes.
* MUL_out = PDPattern.NewNode()
* // Add teller to define some rules that help to filter the target Nodes.
* MUL.teller = lambda(node): node->IsOp() && node->Op()->Type == "mul";
* ELE.teller = lambda(node): \
* node->IsOp() && node->Op()->Type == "elementwise_add";
* MUL_out.teller = lambda(node): node->IsVar() && (MUL in node->inputs)
* && (ELE in node->outputs)
*
* One can add more specific tellers for PDNodes or edges; both Operator
* and Variable Nodes can be constrained by PDNode.teller.
*
* PDPattern can record general patterns, such as a pattern that represents
* - Op on CPU -> Op on GPU -> Op on CPU, to find out where abnormal IO happens.
* - Ops whose inputs and outputs share the same variables
*/
class PDPattern {
public:
using edge_t = std::pair<PDNode*, PDNode*>;
void AddEdge(PDNode* a, PDNode* b);
PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = "");
const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
const std::vector<edge_t>& edges() const { return edges_; }
private:
#ifdef PADDLE_WITH_TESTING
FRIEND_TEST(PDPattern, AddEdge);
FRIEND_TEST(PDPattern, NewNode);
#endif
std::vector<std::unique_ptr<PDNode>> nodes_;
std::vector<edge_t> edges_;
};
/*
* GraphPatternDetecter helps to detect the specific patterns in the graph.
* Input a pattern, output a list of the matched subgraphs/nodes.
* This helper can be used to support fusion passes (e.g. fusing conv + batchnorm).
*
* The algorithm has three phases:
* 1. Mark the nodes that match the defined PDNodes in a PDPattern,
* 2. Extend a PDNode to subgraphs by deducing the connection relations defined
* in the PDPattern (the edges),
* 3. Get the filtered subgraphs and treat them with a pre-defined handler.
*
* Usage:
* // Create a detector
* GraphPatternDetecter detector;
* // Define the detector's pattern by adding PDNodes and defining the edges.
* auto* node0 = detector.mutable_pattern()->NewNode(...)
* auto* node1 = detector.mutable_pattern()->NewNode(...)
* node0->teller = some lambda.
* node1->teller = some lambda.
* detector.mutable_pattern()->AddEdge(node0, node1);
* // Create a handler to define how to treat the filtered
* // subgraphs that comply with the pattern.
* GraphPatternDetecter::handle_t handler = some lambda
* // Execute the detector.
* detector(&graph, handler);
*/
class GraphPatternDetecter {
public:
using subgraph_t = std::unordered_map<PDNode*, Node*>;
// Operate on the detected pattern.
using handle_t =
std::function<void(const subgraph_t& /*matched subgraph*/, Graph*)>;
void operator()(Graph* graph, handle_t handler);
const PDPattern& pattern() const { return pattern_; }
PDPattern* mutable_pattern() { return &pattern_; }
private:
// Mark the nodes that fit the pattern.
bool MarkPDNodesInGraph(const ir::Graph& graph);
// Detect all the patterns and output the matched records.
std::vector<subgraph_t> DetectPatterns();
// Remove duplicate patterns.
void UniquePatterns(std::vector<subgraph_t>* subgraphs);
// Remove overlapping matched subgraphs; when two overlap, keep the earlier one.
void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
#ifdef PADDLE_WITH_TESTING
FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph);
FRIEND_TEST(GraphPatternDetecter, DetectPatterns);
#endif
private:
using hit_rcd_t =
std::pair<Node* /*node in graph*/, PDNode* /*node in pattern*/>;
PDPattern pattern_;
std::vector<hit_rcd_t> marked_records_;
std::unordered_map<const PDNode*, std::unordered_set<Node*>> pdnodes2nodes_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
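
// Editor's note: a minimal sketch of driving the detector above (not part of
// the commit). It assumes the build setup of the surrounding files; the
// function name and the "mul" teller are illustrative only.
#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"

namespace paddle {
namespace framework {
namespace ir {

void DetectMulToVar(Graph* graph) {
  GraphPatternDetecter detector;
  // A PDNode that matches any "mul" operator node.
  auto* mul_op = detector.mutable_pattern()->NewNode(
      [](Node* node) { return node->IsOp() && node->Op()->Type() == "mul"; },
      "mul_op");
  // A PDNode that matches any variable node; the edge below constrains it to
  // be an output of the matched "mul" node.
  auto* mul_out = detector.mutable_pattern()->NewNode(
      [](Node* node) { return node->IsVar(); }, "mul_out");
  detector.mutable_pattern()->AddEdge(mul_op, mul_out);
  // The handler runs once for each matched, non-overlapping subgraph.
  GraphPatternDetecter::handle_t handler =
      [&](const GraphPatternDetecter::subgraph_t& subgraph, Graph* g) {
        LOG(INFO) << "matched: " << subgraph.at(mul_op)->Name() << " -> "
                  << subgraph.at(mul_out)->Name();
      };
  detector(graph, handler);
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle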

@@ -0,0 +1,172 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
void BuildGraph(Graph* g) {
ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
// o1->v1->o2
o1->outputs.push_back(v1);
o2->inputs.push_back(v1);
v1->inputs.push_back(o1);
v1->outputs.push_back(o2);
// o2->v2->o3
// o2->v2->o4
o2->outputs.push_back(v2);
o3->inputs.push_back(v2);
o4->inputs.push_back(v2);
v2->inputs.push_back(o2);
v2->outputs.push_back(o3);
v2->outputs.push_back(o4);
// o2->v3->o5
o2->outputs.push_back(v3);
o5->inputs.push_back(v3);
v3->inputs.push_back(o2);
v3->outputs.push_back(o5);
// o3->v4->o5
o3->outputs.push_back(v4);
o5->inputs.push_back(v4);
v4->inputs.push_back(o3);
v4->outputs.push_back(o5);
}
TEST(PDPattern, NewNode) {
PDPattern x;
auto* n = x.NewNode([](Node* x) { return true; });
ASSERT_TRUE(n);
ASSERT_EQ(x.nodes_.size(), 1UL);
}
TEST(PDPattern, AddEdge) {
PDPattern x;
auto* a = x.NewNode([](Node* x) { return true; });
auto* b = x.NewNode([](Node* x) { return true; });
ASSERT_TRUE(a);
ASSERT_TRUE(b);
x.AddEdge(a, b);
ASSERT_EQ(x.nodes_.size(), 2UL);
ASSERT_EQ(x.edges_.size(), 1UL);
ASSERT_EQ(x.edges_.front().first, a);
ASSERT_EQ(x.edges_.front().second, b);
ASSERT_EQ(x.nodes().size(), 2UL);
ASSERT_EQ(x.edges().size(), 1UL);
ASSERT_EQ(x.edges().front().first, a);
ASSERT_EQ(x.edges().front().second, b);
}
TEST(GraphPatternDetecter, MarkPDNodesInGraph) {
GraphPatternDetecter x;
// mark o2, o3, v2
// The pattern is a graph:
// o2(a node named o2) -> v2(a node named v2)
// v2 -> o3(a node named o3)
auto* o2 = x.pattern_.NewNode([](Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->Name() == "op2" && node->IsOp();
});
auto* o3 = x.pattern_.NewNode([](Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->Name() == "op3" && node->IsOp();
});
auto* v2 = x.pattern_.NewNode([](Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->Name() == "var2" && node->IsVar();
});
ASSERT_FALSE(o2->Tell(nullptr));
ASSERT_FALSE(o3->Tell(nullptr));
ASSERT_FALSE(v2->Tell(nullptr));
x.pattern_.AddEdge(o2, v2);
x.pattern_.AddEdge(v2, o3);
ASSERT_EQ(x.pattern_.edges().size(), 2UL);
ASSERT_EQ(x.pattern_.edges()[0].first, o2);
ASSERT_EQ(x.pattern_.edges()[0].second, v2);
ASSERT_EQ(x.pattern_.edges()[1].first, v2);
ASSERT_EQ(x.pattern_.edges()[1].second, o3);
ProgramDesc program;
Graph graph(program);
BuildGraph(&graph);
x.MarkPDNodesInGraph(graph);
ASSERT_EQ(x.pdnodes2nodes_.size(), 3UL);
auto subgraphs = x.DetectPatterns();
ASSERT_EQ(subgraphs.size(), 1UL);
}
TEST(GraphPatternDetecter, MultiSubgraph) {
ProgramDesc program;
Graph graph(program);
BuildGraph(&graph);
GraphPatternDetecter x;
// The pattern is a graph:
// op -> var
auto* any_op = x.mutable_pattern()->NewNode(
[](Node* node) {
return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
},
"OP0");
auto* any_var = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsVar(); }, "VAR");
auto* any_op1 = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsOp(); }, "OP1");
x.mutable_pattern()->AddEdge(any_op, any_var);
x.mutable_pattern()->AddEdge(any_var, any_op1);
int count = 0;
GraphPatternDetecter::handle_t handle = [&](
const GraphPatternDetecter::subgraph_t& s, Graph* g) {
LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> "
<< s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name();
count++;
};
x(&graph, handle);
// 1. Detect op3 -> var4 -> op5
// 2. Detect op2 -> var2 -> op3
// 3. Detect op2 -> var2 -> op4
// 4. Detect op2 -> var3 -> op5
// But 2, 3, and 4 overlap with each other; only 2 is kept, so the final matches are 1 and 2.
ASSERT_GE(count, 1UL);
ASSERT_LE(count, 2UL);
}
} // namespace ir
} // namespace framework
} // namespace paddle

@@ -0,0 +1,69 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/graph_traits.h"
namespace paddle {
namespace framework {
namespace ir {
//
// NodesDFSIterator
//
NodesDFSIterator::NodesDFSIterator(const std::vector<Node *> &source) {
for (auto *x : source) stack_.push(x);
}
NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept
: stack_(std::move(other.stack_)),
visited_(std::move(other.visited_)) {}
NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other)
: stack_(other.stack_), visited_(other.visited_) {}
Node &NodesDFSIterator::operator*() {
PADDLE_ENFORCE(!stack_.empty());
return *stack_.top();
}
NodesDFSIterator &NodesDFSIterator::operator++() {
PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range");
visited_.insert(stack_.top());
auto *cur = stack_.top();
stack_.pop();
for (auto *x : cur->outputs) {
if (!visited_.count(x)) {
stack_.push(x);
}
}
return *this;
}
bool NodesDFSIterator::operator==(const NodesDFSIterator &other) {
if (stack_.empty()) return other.stack_.empty();
if ((!stack_.empty()) && (!other.stack_.empty())) {
return stack_.top() == other.stack_.top();
}
return false;
}
NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) {
stack_ = other.stack_;
visited_ = other.visited_;
return *this;
}
Node *NodesDFSIterator::operator->() { return stack_.top(); }
} // namespace ir
} // namespace framework
} // namespace paddle

@@ -0,0 +1,90 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stack>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace framework {
namespace ir {
template <typename IteratorT>
class iterator_range {
IteratorT begin_, end_;
public:
template <typename Container>
explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
iterator_range(const IteratorT &begin, const IteratorT &end)
: begin_(begin), end_(end) {}
const IteratorT &begin() const { return begin_; }
const IteratorT &end() const { return end_; }
};
// DFS iterator on nodes.
struct NodesDFSIterator
: public std::iterator<std::forward_iterator_tag, Node *> {
NodesDFSIterator() = default;
explicit NodesDFSIterator(const std::vector<Node *> &source);
NodesDFSIterator(NodesDFSIterator &&other) noexcept;
NodesDFSIterator(const NodesDFSIterator &other);
Node &operator*();
NodesDFSIterator &operator++();
// TODO(Superjomn) the current implementation only compares the first
// element; it should compare the graph and all the elements in the stack
// and the visited set.
NodesDFSIterator &operator=(const NodesDFSIterator &other);
bool operator==(const NodesDFSIterator &other);
bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
Node *operator->();
private:
std::stack<Node *> stack_;
std::unordered_set<Node *> visited_;
};
/*
* GraphTraits contains some graph traversal algorithms.
*
* Usage:
*
*   for (auto &node : GraphTraits::DFS(graph)) { ... }
*/
struct GraphTraits {
static iterator_range<NodesDFSIterator> DFS(const Graph &g) {
auto start_points = ExtractStartPoints(g);
return iterator_range<NodesDFSIterator>(NodesDFSIterator(start_points),
NodesDFSIterator());
}
private:
// Nodes that have no inputs are treated as start points.
static std::vector<Node *> ExtractStartPoints(const Graph &g) {
std::vector<Node *> result;
for (auto *node : g.Nodes()) {
if (node->inputs.empty()) {
result.push_back(node);
}
}
return result;
}
};
} // namespace ir
} // namespace framework
} // namespace paddle
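
// Editor's note: a minimal sketch (not part of the commit) of the DFS
// traversal declared above, mirroring its use in MarkPDNodesInGraph earlier
// in this commit; the function name is illustrative.
#include "paddle/fluid/framework/ir/graph_traits.h"

namespace paddle {
namespace framework {
namespace ir {

void PrintNodesDFS(const Graph& graph) {
  // The traversal starts from all nodes without inputs and follows outputs.
  for (auto& node : GraphTraits::DFS(graph)) {
    LOG(INFO) << "visit: " << node.Name();
  }
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle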

@@ -58,6 +58,9 @@ class Node {
return op_desc_;
}
bool IsOp() const { return type_ == Type::kOperation; }
bool IsVar() const { return type_ == Type::kVariable; }
std::vector<Node*> inputs;
std::vector<Node*> outputs;

@@ -202,6 +202,52 @@ std::vector<std::string> OpDesc::AttrNames() const {
}
void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
// NOTICE(minqiyang): pybind11 casts an empty list in Python to
// the std::vector<int> type in C++, so we have to correct the attr's type
// here when we meet this issue
proto::AttrType attr_type = static_cast<proto::AttrType>(v.which() - 1);
if (attr_type == proto::AttrType::INTS &&
boost::get<std::vector<int>>(v).size() == 0u) {
// Find current attr via attr name and set the correct attribute value
const proto::OpProto::Attr &attr = GetProtoAttr(name);
switch (attr.type()) {
case proto::AttrType::BOOLEANS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BOOLEANS";
this->attrs_[name] = std::vector<bool>();
break;
}
case proto::AttrType::INTS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to INTS";
this->attrs_[name] = std::vector<int>();
break;
}
case proto::AttrType::FLOATS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to FLOATS";
this->attrs_[name] = std::vector<float>();
break;
}
case proto::AttrType::STRINGS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to STRINGS";
this->attrs_[name] = std::vector<std::string>();
break;
}
case proto::AttrType::BLOCKS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BLOCKS";
this->SetBlocksAttr(name, std::vector<BlockDesc *>());
return;
}
default:
PADDLE_THROW("Wrong attr type %d", attr.type());
}
need_update_ = true;
return;
}
this->attrs_[name] = v;
need_update_ = true;
}
@@ -229,6 +275,19 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
return it->second;
}
const proto::OpProto::Attr &OpDesc::GetProtoAttr(
const std::string &name) const {
const proto::OpProto &proto = OpInfoMap::Instance().Get(Type()).Proto();
for (int i = 0; i != proto.attrs_size(); ++i) {
const proto::OpProto::Attr &attr = proto.attrs(i);
if (attr.name() == name) {
return attr;
}
}
PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type());
}
Attribute OpDesc::GetNullableAttr(const std::string &name) const {
auto it = attrs_.find(name);
if (it != attrs_.end()) {

@@ -81,6 +81,8 @@ class OpDesc {
Attribute GetAttr(const std::string &name) const;
const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const;
Attribute GetNullableAttr(const std::string &name) const;
int GetBlockAttrId(const std::string &name) const;

@@ -0,0 +1,46 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <pthread.h>
#include "paddle/fluid/platform/enforce.h"  // for PADDLE_ENFORCE_EQ below
namespace paddle {
namespace framework {
struct RWLock {
RWLock() { pthread_rwlock_init(&lock_, nullptr); }
~RWLock() { pthread_rwlock_destroy(&lock_); }
void RDLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
"acquire read lock failed");
}
void WRLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
"acquire write lock failed");
}
void UNLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
}
private:
pthread_rwlock_t lock_;
};
} // namespace framework
} // namespace paddle
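
// Editor's note: a minimal usage sketch of the RWLock above (not part of the
// commit), mirroring how SelectedRows::AutoGrownIndex below uses it: readers
// take RDLock, writers take WRLock, and every path must call UNLock.
#include <unordered_map>
#include "paddle/fluid/framework/rw_lock.h"

int64_t LookupOrInsert(paddle::framework::RWLock* lock,
                       std::unordered_map<int64_t, int64_t>* table,
                       int64_t key) {
  lock->RDLock();
  auto it = table->find(key);
  if (it != table->end()) {
    int64_t value = it->second;
    lock->UNLock();
    return value;
  }
  lock->UNLock();
  // Upgrade to a write lock and insert; emplace re-checks the key, since
  // another thread may have inserted it between the two lock sections.
  lock->WRLock();
  auto result = table->emplace(key, static_cast<int64_t>(table->size()));
  int64_t value = result.first->second;
  lock->UNLock();
  return value;
}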

@@ -120,66 +120,76 @@ bool SelectedRows::HasKey(int64_t key) const {
: true;
}
std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
const std::vector<int64_t>& keys, framework::Tensor* value) const {
int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) {
rwlock_->RDLock();
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
rwlock_->UNLock();
if (!auto_grown) {
PADDLE_THROW("key %d not found", key);
}
rwlock_->WRLock();
auto map_size = id_to_index_.size();
auto vector_size = rows_.size();
if (map_size != vector_size) {
rwlock_->UNLock();
PADDLE_THROW(
"id_to_index_ size %d should have the same size with rows_ %d",
map_size, vector_size);
}
auto write_iter = id_to_index_.find(key);
if (write_iter == id_to_index_.end()) {
size_t row_num = rows_.size();
if (row_num == value_->dims()[0]) {
rwlock_->UNLock();
PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
}
// Insert the new key into rows_ and id_to_index_.
rows_.push_back(key);
auto index = static_cast<int64_t>(rows_.size() - 1);
id_to_index_[key] = index;
rwlock_->UNLock();
return index;
} else {
auto index = write_iter->second;
rwlock_->UNLock();
return index;
}
} else {
auto index = iter->second;
rwlock_->UNLock();
return index;
}
}
void SelectedRows::SyncIndex() {
rwlock_->WRLock();
id_to_index_.clear();
for (size_t i = 0; i < rows_.size(); ++i) {
id_to_index_[rows_[i]] = i;
}
rwlock_->UNLock();
}
void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
bool auto_grown) {
PADDLE_ENFORCE(value->IsInitialized(),
"The value tensor should be initialized.");
std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
if (keys.empty()) {
if (ids.numel() == 0) {
VLOG(3) << "keys is empty, please check data!";
} else {
int64_t value_width = value_->numel() / value_->dims()[0];
PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
"output tensor should have the same shape with table "
"except the dims[0].");
for (size_t i = 0; i < keys.size(); ++i) {
int64_t index = Index(keys[i]);
if (index == -1) {
non_keys_pair.push_back(
std::make_pair(keys[i], static_cast<int64_t>(i)));
} else {
framework::VisitDataType(
framework::ToDataType(value_->type()),
TensorCopyVisitor(value, i * value_width, *value_.get(),
index * value_width, value_width));
}
for (size_t i = 0; i < ids.numel(); ++i) {
int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown);
framework::VisitDataType(
framework::ToDataType(value_->type()),
TensorCopyVisitor(value, i * value_width, *value_.get(),
index * value_width, value_width));
}
}
return non_keys_pair;
}
bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized.");
if (value_->IsInitialized()) {
PADDLE_ENFORCE_EQ(
value.type(), value_->type(),
"The type of the value should be same with the original value");
}
PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
"The first dim of value should be 1.");
std::lock_guard<std::mutex> lock(*auto_grown_mutex_.get());
auto index = Index(key);
bool is_new_key = false;
if (index == -1) {
rows_.push_back(key);
index = rows_.size() - 1;
is_new_key = true;
// whether need to resize the table
if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
auto dims = value_->dims();
dims[0] = (dims[0] + 1) << 1;
framework::VisitDataType(framework::ToDataType(value.type()),
ReAllocateVisitor(dims, value_.get()));
}
}
framework::VisitDataType(
framework::ToDataType(value.type()),
TensorCopyVisitor(value_.get(),
index * value_->numel() / value_->dims()[0], value,
static_cast<int64_t>(0), value.numel()));
return is_new_key;
}
} // namespace framework

@@ -17,10 +17,12 @@ limitations under the License. */
#include <algorithm>
#include <memory>
#include <mutex> // NOLINT
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h"
@@ -48,13 +50,13 @@ class SelectedRows {
SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
: rows_(rows), height_(height) {
value_.reset(new Tensor());
auto_grown_mutex_.reset(new std::mutex);
rwlock_.reset(new RWLock);
}
SelectedRows() {
height_ = 0;
value_.reset(new Tensor());
auto_grown_mutex_.reset(new std::mutex);
rwlock_.reset(new RWLock);
}
platform::Place place() const { return value_->place(); }
@@ -74,47 +76,51 @@ class SelectedRows {
void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
/*
* @brief whether the table has the specified key.
* @brief Get the index of key in rows
*
* @return -1 if the key does not exist.
*/
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
PADDLE_THROW("id %s not in table", key);
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
/*
* @brief whether the table has the specified key.
*
* @return true if the key exists.
*/
bool HasKey(int64_t key) const;
/*
* @brief Get value by the key list, if the
* @brief Get value by the key list.
* Note: this interface is only used when selected_rows is used as
* parameters for a distributed lookup table.
*
* @return a list of pairs containing the non-existent keys and their
* indices in the value
*/
std::vector<std::pair<int64_t, int64_t>> Get(const std::vector<int64_t>& keys,
framework::Tensor* value) const;
void Get(const framework::Tensor& ids, framework::Tensor* value,
bool auto_grown = false);
/*
* @brief Set a key-value pair into the table.
* This function will double the value memory if it's not engouth.
* @brief Get the index of the key from the id_to_index_ map. If the key
* does not exist, add it into id_to_index_.
*
* @note:
* 1. The first dim of the value should be 1
* 2. The value should be initialized and the data type
* should be the same with the table.
*
* @return true if the key is a new one, otherwise false
* Note: this interface is only used when selected_rows is used as
* parameters for a distributed lookup table.
*
* @return index of the key.
*/
bool Set(int64_t key, const Tensor& value);
int64_t AutoGrownIndex(int64_t key, bool auto_grown);
/*
* @brief Get the index of key in rows
*
* @return -1 if the key does not exist.
*/
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
return static_cast<int64_t>(-1);
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
void SyncIndex();
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
@@ -127,9 +133,10 @@ class SelectedRows {
// SelectedRows are simply concatenated when added together. Only when a
// SelectedRows is added to a Tensor are the duplicate rows handled.
Vector<int64_t> rows_;
std::unordered_map<int64_t, int64_t> id_to_index_;
std::unique_ptr<Tensor> value_{nullptr};
int64_t height_;
std::unique_ptr<std::mutex> auto_grown_mutex_{nullptr};
std::unique_ptr<RWLock> rwlock_{nullptr};
};
/*

@@ -9,8 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include <time.h>
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/framework/selected_rows.h"
namespace paddle {
namespace framework {
@@ -59,39 +62,129 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
}
TEST_F(SelectedRowsTester, SparseTable) {
TEST(SelectedRows, SparseTable) {
platform::CPUPlace cpu;
SelectedRows table;
int64_t table_size = 100;
int64_t embedding_width = 8;
// initialize a sparse table
table.mutable_value()->Resize(framework::make_ddim({1, 100}));
table.mutable_value()->mutable_data<float>(cpu);
table.mutable_rows()->push_back(1);
table.mutable_value()->Resize(
framework::make_ddim({table_size, embedding_width}));
auto* data = table.mutable_value()->mutable_data<float>(cpu);
for (int64_t i = 0; i < table_size; ++i) {
for (int64_t j = 0; j < embedding_width; ++j) {
data[i * embedding_width + j] = static_cast<float>(i);
}
}
ASSERT_EQ(table.AutoGrownIndex(10, true), 0);
ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
ASSERT_EQ(table.AutoGrownIndex(6, true), 2);
ASSERT_TRUE(table.HasKey(10));
ASSERT_TRUE(table.HasKey(8));
ASSERT_TRUE(table.HasKey(6));
ASSERT_EQ(table.rows().size(), 3);
framework::Tensor ids;
ids.Resize(framework::make_ddim({4}));
auto* ids_data = ids.mutable_data<int64_t>(cpu);
ids_data[0] = static_cast<int64_t>(6);
ids_data[1] = static_cast<int64_t>(6);
ids_data[2] = static_cast<int64_t>(8);
ids_data[3] = static_cast<int64_t>(10);
int64_t key = 10000;
int64_t non_key = 999;
framework::Tensor value;
value.Resize(framework::make_ddim({1, 100}));
auto ptr = value.mutable_data<float>(cpu);
ptr[0] = static_cast<float>(10);
framework::Tensor get_value;
auto* value_data = get_value.mutable_data<float>(
framework::make_ddim({4, embedding_width}), cpu);
table.Get(ids, &get_value);
ASSERT_EQ(table.rows().size(), static_cast<size_t>(1));
ASSERT_EQ(table.HasKey(key), false);
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[0 * embedding_width + j], 2);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[1 * embedding_width + j], 2);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[2 * embedding_width + j], 1);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[3 * embedding_width + j], 0);
}
}
table.Set(key, value);
void f1(SelectedRows* table, int table_size) {
for (int i = 1000000; i > 0; --i) {
auto id = i % table_size;
int64_t index1 = table->AutoGrownIndex(id, true);
int64_t index2 = table->AutoGrownIndex(id, false);
int64_t index3 = table->AutoGrownIndex(id, true);
ASSERT_EQ(index1, index2);
ASSERT_EQ(index2, index3);
}
}
ASSERT_EQ(table.rows().size(), static_cast<size_t>(2));
ASSERT_EQ(table.HasKey(key), true);
// check re-allocate
ASSERT_EQ(table.value().dims()[0], static_cast<int64_t>(4));
void f2(SelectedRows* table, int table_size) {
for (int i = 0; i < 1000000; ++i) {
auto id = i % table_size;
int64_t index1 = table->AutoGrownIndex(id, true);
int64_t index2 = table->AutoGrownIndex(id, false);
int64_t index3 = table->AutoGrownIndex(id, true);
ASSERT_EQ(index1, index2);
ASSERT_EQ(index2, index3);
}
}
framework::Tensor get_value;
get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
std::vector<int64_t> keys({non_key, key});
auto non_key_pairs = table.Get(keys, &get_value);
void f3(SelectedRows* table, int table_size) {
clock_t t1 = clock();
for (int i = 100000; i > 0; --i) {
auto id1 = table->AutoGrownIndex(i % table_size, true);
auto id2 = table->Index(i % table_size);
ASSERT_EQ(id1, id2);
}
clock_t t2 = clock();
std::cout << "f3 run time:" << t2 - t1 << std::endl;
}
void f4(SelectedRows* table, int table_size) {
clock_t t1 = clock();
for (int i = 0; i < 100000; ++i) {
auto id1 = table->AutoGrownIndex(i % table_size, true);
auto id2 = table->Index(i % table_size);
ASSERT_EQ(id1, id2);
}
clock_t t2 = clock();
std::cout << "f4 run time:" << t2 - t1 << std::endl;
}
TEST(SelectedRows, MultiThreadAutoIndex) {
platform::CPUPlace cpu;
SelectedRows table;
int64_t table_size = 100000;
int64_t embedding_width = 8;
// initialize a sparse table
table.mutable_value()->Resize(
framework::make_ddim({table_size, embedding_width}));
auto* data = table.mutable_value()->mutable_data<float>(cpu);
for (int64_t i = 0; i < table_size; ++i) {
for (int64_t j = 0; j < embedding_width; ++j) {
data[i * embedding_width + j] = static_cast<float>(i);
}
}
ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10));
ASSERT_EQ(non_key_pairs.size(), static_cast<size_t>(1));
ASSERT_EQ(non_key_pairs[0].first, non_key);
std::thread t1(f1, &table, table_size);
std::thread t11(f1, &table, table_size);
std::thread t2(f2, &table, table_size);
std::thread t22(f2, &table, table_size);
t1.join();
t11.join();
t2.join();
t22.join();
std::thread t3(f3, &table, table_size);
std::thread t4(f4, &table, table_size);
t3.join();
t4.join();
}
} // namespace framework

@@ -20,6 +20,9 @@
DEFINE_int32(io_threadpool_size, 100,
"number of threads used for doing IO, default 100");
DEFINE_int32(dist_threadpool_size, 0,
"number of threads used for distributed executed.");
namespace paddle {
namespace framework {
@@ -35,6 +38,10 @@ void ThreadPool::Init() {
if (threadpool_.get() == nullptr) {
// TODO(Yancey1989): specify the max threads number
int num_threads = std::thread::hardware_concurrency();
if (FLAGS_dist_threadpool_size > 0) {
num_threads = FLAGS_dist_threadpool_size;
VLOG(1) << "set dist_threadpool_size to " << num_threads;
}
PADDLE_ENFORCE_GT(num_threads, 0);
threadpool_.reset(new ThreadPool(num_threads));
}
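
// Editor's note: a small, hypothetical sketch (not part of the commit) of how
// the new flag takes effect; it assumes ThreadPool::GetInstance() from the
// surrounding threadpool.h and a standard gflags setup.
#include "gflags/gflags.h"
#include "paddle/fluid/framework/threadpool.h"

DECLARE_int32(dist_threadpool_size);  // matches the DEFINE_int32 above

int main(int argc, char* argv[]) {
  // Run as: ./demo --dist_threadpool_size=16
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  // The first GetInstance() call triggers ThreadPool::Init(), which now
  // prefers FLAGS_dist_threadpool_size over hardware_concurrency().
  paddle::framework::ThreadPool* pool =
      paddle::framework::ThreadPool::GetInstance();
  (void)pool;
  return 0;
}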

@@ -60,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif()
if (WITH_ANAKIN) # only needed in CI
if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)

@@ -171,6 +171,9 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc has only MKLDNN and CPU kernels, which would mismatch the CPU-only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
@@ -302,12 +305,6 @@ op_library(channel_recv_op DEPS concurrency)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
# because the fully connected layer has only an MKLDNN operator
if(NOT WITH_MKLDNN)
list(REMOVE_ITEM GENERAL_OPS fc_op)
endif(NOT WITH_MKLDNN)
foreach(src ${GENERAL_OPS})
op_library(${src})
endforeach()

@@ -78,10 +78,9 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
auto rows = w->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
auto w_value = w->mutable_value();
w_value->Resize({rows_numel, 10});
for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true);
auto ptr = w_value->mutable_data<float>(*place);

@@ -190,12 +190,15 @@ bool VariableResponse::ProcSerializedField(
#endif
}
VLOG(7) << "ProcSerializedField:" << meta_.varname()
<< ", type:" << meta_.type() << std::endl;
framework::DDim dims = GetDims(meta_.dims());
if (meta_.type() == sendrecv::LOD_TENSOR) {
PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be obtained first!");
if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
return false;
}
return true;
}
@@ -206,7 +209,9 @@ bool VariableResponse::ProcSerializedField(
return true;
}
return true;
PADDLE_ENFORCE("not supported var types:", meta_.varname(), meta_.type());
return false;
}
}  // namespace distributed
