Merge branch 'develop' into anakin_bug

7 years ago · 7c09d19842
parent 3373535b21 c108376506
commit 7c09d19842
58 changed files with 2251 additions and 362 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -204,12 +204,11 @@ include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
    include(external/anakin)
 else()
  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
 endif()
 include(cudnn)              # set cudnn libraries, must before configure
--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
@ -4,7 +4,7 @@ Paddle 预测 API
 为了更简单方便的预测部署，Fluid 提供了一套高层 API
 用来隐藏底层不同的优化实现。
-`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
+`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__
 包括
 -  头文件 ``paddle_inference_api.h`` 定义了所有的接口
@ -104,5 +104,5 @@ engine
 ------------
 -  `inference
-   demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference/demo>`__
+   demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api/demo_ci>`__
-  `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc>`__
+-  `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/api_impl_tester.cc>`__
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -6,7 +6,7 @@ paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=N
 paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No
 paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
@ -52,7 +55,7 @@ paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path
 paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
 paddle.fluid.InferenceTranspiler.__init__ 
@ -74,7 +77,7 @@ paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_pro
 paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
 paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
@ -156,6 +159,8 @@ paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaul
 paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@ -324,7 +329,7 @@ paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array
 paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
 paddle.fluid.transpiler.InferenceTranspiler.__init__ 
--- a/paddle/fluid/framework/details/exception_holder.h
+++ b/paddle/fluid/framework/details/exception_holder.h
@ -14,6 +14,7 @@
 #pragma once
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@ -22,27 +23,24 @@ namespace details {
 class ExceptionHolder {
 public:
-  void Catch(const platform::EnforceNotMet& exp) {
+  void Catch(std::exception_ptr eptr) {
-    std::lock_guard<std::mutex> lock(mu_);
+    try {
-    exception_.reset(new platform::EnforceNotMet(exp));
+      std::rethrow_exception(eptr);
-    type_ = kEnforceNotMet;
+    } catch (platform::EOFException exp) {
-  }
+      Catch(exp);
-
+    } catch (platform::EnforceNotMet exp) {
-  void Catch(const platform::EOFException& exp) {
+      Catch(exp);
-    std::lock_guard<std::mutex> lock(mu_);
+    } catch (...) {
-    // EOFException will not cover up existing EnforceNotMet.
+      LOG(FATAL) << "Unknown exception caught";
    if (exception_.get() == nullptr) {
      exception_.reset(new platform::EOFException(exp));
      type_ = kEOF;
    }
  }
-  bool ExceptionCatched() const {
+  bool IsCaught() const {
    std::lock_guard<std::mutex> lock(mu_);
    return exception_.get() != nullptr;
  }
-  void Throw() {
+  void ReThrow() {
    std::lock_guard<std::mutex> lock(mu_);
    switch (type_) {
      case kNone:
@ -50,27 +48,41 @@ class ExceptionHolder {
      case kEnforceNotMet: {
        auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
        throw e;
        break;
      }
      case kEOF: {
        auto e = *static_cast<platform::EOFException*>(exception_.get());
        throw e;
        break;
      }
      default:
        LOG(FATAL) << "Unknown exception.";
    }
-    exception_.reset();
+    ClearImpl();
    type_ = kNone;
  }
  void Clear() {
    std::lock_guard<std::mutex> lock(mu_);
    ClearImpl();
  }
 private:
  void ClearImpl() {
    exception_.reset();
    type_ = kNone;
  }
- private:
+  void Catch(const platform::EnforceNotMet& exp) {
    std::lock_guard<std::mutex> lock(mu_);
    exception_.reset(new platform::EnforceNotMet(exp));
    type_ = kEnforceNotMet;
  }
  void Catch(const platform::EOFException& exp) {
    std::lock_guard<std::mutex> lock(mu_);
    // EOFException will not cover up existing EnforceNotMet.
    if (exception_.get() == nullptr) {
      exception_.reset(new platform::EOFException(exp));
      type_ = kEOF;
    }
  }
  enum ExceptionType { kNone, kEnforceNotMet, kEOF };
  ExceptionType type_{kNone};
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@ -107,11 +107,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
    auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
    if (timeout) {
-      if (exception_holder_.ExceptionCatched()) {
+      if (exception_holder_.IsCaught()) {
        for (auto &run_op_future : run_op_futures_) {
          run_op_future.wait();
        }
-        exception_holder_.Throw();
+        exception_holder_.ReThrow();
      } else {
        continue;
      }
@ -220,12 +220,8 @@ void ThreadedSSAGraphExecutor::RunOp(
      running_ops_--;
      ready_var_q->Extend(op->Outputs());
      VLOG(10) << op << " " << op->Name() << "Signal posted";
    } catch (platform::EOFException ex) {
      exception_holder_.Catch(ex);
    } catch (platform::EnforceNotMet ex) {
      exception_holder_.Catch(ex);
    } catch (...) {
-      LOG(FATAL) << "Unknown exception catched";
+      exception_holder_.Catch(std::current_exception());
    }
  };
  if (pool_) {
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@ -3,7 +3,10 @@ cc_library(graph SRCS graph.cc DEPS node)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
 cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@ -28,6 +28,38 @@ namespace paddle {
 namespace framework {
 namespace ir {
 /*
 * The graph is a Directed Acyclic Single Static Assignment Graph.
 *
 * In more detail, the following properties must hold:
 *
 *   The graph shouldn't contain cycle. Each node is a black-box to the graph
 *   so the node itself could be a loop operator.
 *
 *   Each Variable-type node has only one input (thus single static assignment).
 *
 *   The output/input of operator is variable and the output/input of variable
 *   is operator.
 *
 * The following data hazards in Program are addressed in the Graph:
 *
 *   Write-After-Read
 *     a = op1(x)
 *     x = op2(b)
 *     A control-dependency connection is created between op1 and op2 such that
 *     op1->op2, so as to ensure correct order.
 *
 *   Write-After-Write
 *     x = op1(a)
 *     x = op2(b)
 *     A control-dependency connection is created between op1 and op2 such that
 *     op1->op2, so as to ensure correct order.
 *
 * Other properties currently hold, but are not enforced yet:
 *
 *   Variable-type node (not control dep) with the same variable name share
 *   the same underlying VarDesc.
 */
 class Graph {
 public:
  explicit Graph(const ProgramDesc &program);
--- a/paddle/fluid/framework/ir/graph_pattern_detecter.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter.cc
@ -0,0 +1,186 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
 namespace framework {
 namespace ir {
 PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
  // The pattern stores the node in nodes_ (a vector of unique_ptr) and hands
  // back a non-owning pointer to it.
  std::unique_ptr<PDNode> owned(new PDNode(std::move(teller), name));
  PDNode* created = owned.get();
  nodes_.push_back(std::move(owned));
  return created;
 }
 void PDPattern::AddEdge(PDNode* a, PDNode* b) {
  // Record a directed edge a -> b. Both endpoints must be non-null and must
  // be two different PDNodes.
  PADDLE_ENFORCE(a);
  PADDLE_ENFORCE(b);
  PADDLE_ENFORCE(a != b, "can't connect to the same nodes.");
  edges_.push_back(edge_t(a, b));
 }
 void GraphPatternDetecter::operator()(Graph* graph,
                                      GraphPatternDetecter::handle_t handler) {
  // Run the full pipeline: mark candidate nodes, extend them to matches,
  // de-duplicate, drop overlapping matches, then hand every surviving match
  // to the handler. Nothing happens when marking reports no candidates.
  const bool any_marked = MarkPDNodesInGraph(*graph);
  if (any_marked) {
    std::vector<subgraph_t> matches = DetectPatterns();
    UniquePatterns(&matches);
    RemoveOverlappedMatch(&matches);
    for (const subgraph_t& match : matches) {
      handler(match, graph);
    }
  }
 }
 // Visit the graph nodes produced by GraphTraits::DFS and record, for every
 // PDNode of the pattern, the set of graph nodes its teller accepts.
 // Returns true when at least one graph node was marked.
 bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) {
  // Start from a clean slate: marks left by an earlier run would otherwise
  // leak into this one (stale Node pointers, and a spurious `true` result
  // even when nothing in `graph` matches).
  pdnodes2nodes_.clear();
  if (graph.Nodes().empty()) return false;
  for (auto& node : GraphTraits::DFS(graph)) {
    for (const auto& pdnode : pattern_.nodes()) {
      if (pdnode->Tell(&node)) {
        pdnodes2nodes_[pdnode.get()].insert(&node);
      }
    }
  }
  return !pdnodes2nodes_.empty();
 }
 struct HitGroup {
  std::unordered_map<PDNode*, Node*> roles;
  bool Match(Node* node, PDNode* pat) {
    return !roles.count(pat) || roles.at(pat) == node;
  }
  void Register(Node* node, PDNode* pat) { roles[pat] = node; }
 };
 // Returns true when b appears among the direct outputs of a.
 bool IsNodesLink(Node* a, Node* b) {
  bool linked = false;
  for (auto it = a->outputs.begin(); !linked && it != a->outputs.end(); ++it) {
    linked = (*it == b);
  }
  return linked;
 }
 // Extend the marked nodes to complete matches by walking the pattern edges
 // in order. Requires MarkPDNodesInGraph to have filled pdnodes2nodes_.
 // Returns one PDNode -> Node map per match; the result may still contain
 // duplicates and overlapping matches.
 std::vector<GraphPatternDetecter::subgraph_t>
 GraphPatternDetecter::DetectPatterns() {
  std::vector<GraphPatternDetecter::subgraph_t> result;
  const auto& edges = pattern_.edges();
  PADDLE_ENFORCE(!edges.empty(), "At least one edge is needed");
  // Seed one group per graph node marked for the source of the first edge.
  auto* first_pnode = edges.front().first;
  auto first_marks = pdnodes2nodes_.find(first_pnode);
  if (first_marks == pdnodes2nodes_.end()) return result;
  // Two buffers used alternately: the groups produced by the previous edge
  // and the groups being built for the current edge.
  std::array<std::vector<HitGroup>, 2> bi_records;
  for (auto* node : first_marks->second) {
    HitGroup group;
    group.Register(node, first_pnode);
    bi_records[0].push_back(group);
  }
  int step = 0;
  // Extend a PDNode to subgraphs by deducing the connection relations defined
  // in edges of PDNodes.
  for (const auto& edge : edges) {
    // Each edge has two PDNodes, which indicates two roles.
    // Detect two Nodes that can match these two roles and they are connected.
    auto& pre_groups = bi_records[step % 2];
    auto& cur_groups = bi_records[1 - step % 2];
    ++step;
    cur_groups.clear();
    // Look the candidates up without operator[], so that roles nobody matched
    // do not get empty entries inserted into pdnodes2nodes_.
    auto sources = pdnodes2nodes_.find(edge.first);
    auto targets = pdnodes2nodes_.find(edge.second);
    if (sources == pdnodes2nodes_.end() || targets == pdnodes2nodes_.end()) {
      // No candidate for one endpoint: no group can survive this edge.
      continue;
    }
    // source -> target
    for (Node* source : sources->second) {
      for (Node* target : targets->second) {
        // The link test does not depend on the group, so do it once per pair.
        if (!IsNodesLink(source, target)) continue;
        // TODO(Superjomn) add some prune strategies.
        for (auto& group : pre_groups) {
          if (!group.Match(source, edge.first)) continue;
          // Copy only groups that can actually be extended.
          HitGroup new_group = group;
          new_group.Register(source, edge.first);
          if (!new_group.Match(target, edge.second)) continue;
          new_group.Register(target, edge.second);
          cur_groups.push_back(std::move(new_group));
          // TODO(Superjomn) need to unique
        }
      }
    }
  }
  // After the last edge, bi_records[step % 2] holds the surviving groups.
  for (auto& group : bi_records[step % 2]) {
    GraphPatternDetecter::subgraph_t subgraph;
    for (auto& role : group.roles) {
      subgraph.emplace(role.first, role.second);
    }
    result.emplace_back(subgraph);
  }
  return result;
 }
 // Remove duplicated matches from *subgraphs, keeping the first occurrence of
 // each and preserving the original order.
 //
 // Two matches are duplicates only when they bind exactly the same PDNodes to
 // exactly the same Nodes. The hash is used purely to find candidates; a
 // match is dropped only after an equality check, so a hash collision (for
 // example two matches that bind the same nodes to swapped roles) can no
 // longer discard a distinct match.
 void GraphPatternDetecter::UniquePatterns(
    std::vector<GraphPatternDetecter::subgraph_t>* subgraphs) {
  if (subgraphs->empty()) return;
  std::vector<GraphPatternDetecter::subgraph_t> result;
  // hash of a match -> indices (into result) of kept matches with that hash.
  std::unordered_map<size_t, std::vector<size_t>> buckets;
  for (auto& g : *subgraphs) {
    // Order-independent hash: equal maps may iterate in different orders, so
    // the per-pair hashes are combined with a commutative operation.
    size_t key = 0;
    for (const auto& item : g) {
      const size_t role_hash = std::hash<void*>{}(item.first);
      const size_t node_hash = std::hash<void*>{}(item.second);
      key += role_hash ^
             (node_hash + 0x9e3779b9 + (role_hash << 6) + (role_hash >> 2));
    }
    auto& bucket = buckets[key];
    bool duplicated = false;
    for (size_t kept : bucket) {
      if (result[kept] == g) {
        duplicated = true;
        break;
      }
    }
    if (!duplicated) {
      bucket.push_back(result.size());
      result.emplace_back(g);
    }
  }
  *subgraphs = std::move(result);
 }
 void GraphPatternDetecter::RemoveOverlappedMatch(
    std::vector<subgraph_t>* subgraphs) {
  std::vector<subgraph_t> result;
  std::unordered_set<Node*> node_set;
  for (const auto& subgraph : *subgraphs) {
    bool valid = true;
    for (auto& item : subgraph) {
      if (node_set.count(item.second)) {
        valid = false;
        break;
      }
    }
    if (valid) {
      for (auto& item : subgraph) {
        node_set.insert(item.second);
      }
      result.push_back(subgraph);
    }
  }
  *subgraphs = result;
 }
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detecter.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter.h
@ -0,0 +1,181 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #ifdef PADDLE_WITH_TESTING
 #include <gtest/gtest_prod.h>
 #endif
 #include <functional>
 #include <memory>
 #include <numeric>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 namespace paddle {
 namespace framework {
 namespace ir {
 // Some basic terminologies:
 //   - PDPattern: a pattern defined as a data flow graph.
 //   - PDNode: the node in the pattern, each PDNode represents an `ir::Node`
 //     that meets some conditions defined in `PDNode.teller`.
 //   - A pattern is defined with PDNodes with edges.
 // Pattern detector node. This node helps to build a pattern.
 // A node of a pattern. It wraps a predicate (the "teller") that decides
 // whether a given ir::Node can play this role, plus an optional name.
 // PDNodes are movable but not copyable.
 struct PDNode {
  // tell whether an ir::Node* is a candidate for a PDNode.
  using teller_t = std::function<bool(Node*)>;
  // Takes ownership of `teller`; an empty teller is rejected.
  PDNode(teller_t&& teller, const std::string& name = "")
      // `teller` is a named rvalue reference, i.e. an lvalue here: without
      // std::move the std::function (and its captures) would be copied.
      : teller_(std::move(teller)), name_(name) {
    PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set.");
  }
  PDNode(PDNode&& other) = default;
  // Neighbouring PDNodes. Nothing in this struct fills these vectors.
  std::vector<PDNode*> inlinks;
  std::vector<PDNode*> outlinks;
  // Apply the teller to `node`.
  bool Tell(Node* node) const {
    PADDLE_ENFORCE(teller_ != nullptr, "teller should be set for a PDNode");
    return teller_(node);
  }
  const std::string& name() const { return name_; }
  PDNode(const PDNode&) = delete;
  PDNode& operator=(const PDNode&) = delete;
 private:
  teller_t teller_;
  std::string name_;
 };
 /*
 * A pattern in a graph, which defined with PDNode and edges. Most graph
 * patterns can be divided into PDNodes and link relations between them.
 *
 * For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD
 * operators from the computation graph, the MUL's output should have only one
 * consumer which is the ELEMENTWISE_ADD.
 * This pattern can be defined with the following pseudo code:
 *
 *     // Create two operator PDNodes.
 *     MUL = PDPattern.NewNode()
 *     ELE = PDPattern.NewNode()
 *     // Create the variable PDNodes.
 *     MUL_out = PDPattern.NewNode()
 *     // Add teller to define some rules that help to filter the target Nodes.
 *     MUL.teller = lambda(node): node->IsOp() && node->Op()->Type == "mul";
 *     ELE.teller = lambda(node): \
 *                        node->IsOp() && node->Op()->Type == "elementwise_add";
 *     MUL_out.teller = lambda(node): node->IsVar() && (MUL in node->inputs)
 *                                                  && (ELE in node->outputs)
 *
 * One can add more specific tellers for PDNodes or edges, both the Operator
 * and Variable Nodes can be ruled in PDNode.teller.
 *
 * PDPattern can record the general patterns, such as the pattern represents
 *   - Op in CPU -> Op in GPU -> Op in CPU, to find out the IO abnormal place.
 *   - Ops whose inputs and outputs share the same variables
 */
 // Container of the PDNodes and directed edges that make up one pattern.
 class PDPattern {
 public:
  // A directed edge: first -> second.
  using edge_t = std::pair<PDNode*, PDNode*>;
  // Append the edge a -> b to the pattern.
  void AddEdge(PDNode* a, PDNode* b);
  // Create a PDNode from `teller` and `name`; returns a raw pointer to it.
  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = "");
  // Read-only views of the stored nodes and edges.
  const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
  const std::vector<edge_t>& edges() const { return edges_; }
 private:
 #ifdef PADDLE_WITH_TESTING
  // Let the unit tests inspect nodes_ and edges_ directly.
  FRIEND_TEST(PDPattern, AddEdge);
  FRIEND_TEST(PDPattern, NewNode);
 #endif
  // The PDNodes are held through unique_ptr; edges_ refers to them by raw
  // pointer.
  std::vector<std::unique_ptr<PDNode>> nodes_;
  std::vector<edge_t> edges_;
 };
 /*
 * GraphPatternDetecter helps to detect the specific patterns in the graph.
 * Input a pattern, output a list of the matched subgraphs/nodes.
 * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.).
 *
 * The algorithm has three phases:
 *   1. Mark the nodes that match the defined PDNodes in a PDPattern,
 *   2. Extend a PDNode to subgraphs by deducing the connection relation defined
 *      in PDPattern(the edges),
 *   3. Get the filtered subgraphs and treat them with a pre-defined handler.
 *
 * Usage:
 *    // Create a detector
 *    GraphPatternDetecter detector;
 *    // Define the detector's pattern, by adding PDNode and define the edges.
 *    // The teller (some lambda) is passed when the node is created.
 *    auto* node0 = detector.mutable_pattern()->NewNode(some lambda);
 *    auto* node1 = detector.mutable_pattern()->NewNode(some lambda);
 *    detector.mutable_pattern()->AddEdge(node0, node1);
 *    // Create an handler, to define the behavior of treating the filtered
 *    // subgraphs that comply with the patterns.
 *    GraphPatternDetecter::handle_t handler = some lambda
 *    // Execute the detector.
 *    detector(&graph, handler);
 */
 // Finds the subgraphs of a Graph that match pattern_ and passes each of them
 // to a user-supplied handler.
 class GraphPatternDetecter {
 public:
  // One match: the graph Node bound to each PDNode of the pattern.
  using subgraph_t = std::unordered_map<PDNode*, Node*>;
  // Operate on the detected pattern.
  using handle_t =
      std::function<void(const subgraph_t& /*hitted pattern*/, Graph*)>;
  // Detect the pattern in `graph` and call `handler` once per kept match.
  void operator()(Graph* graph, handle_t handler);
  // Read-only / mutable access to the pattern being searched for.
  const PDPattern& pattern() const { return pattern_; }
  PDPattern* mutable_pattern() { return &pattern_; }
 private:
  // Mark the nodes that fits the pattern.
  bool MarkPDNodesInGraph(const ir::Graph& graph);
  // Detect all the pattern and output the hit records.
  std::vector<subgraph_t> DetectPatterns();
  // Remove duplicate patterns.
  void UniquePatterns(std::vector<subgraph_t>* subgraphs);
  // Remove overlapped match subgraphs, when overlapped, keep the previous one.
  void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
 #ifdef PADDLE_WITH_TESTING
  // Let the unit tests reach the private members.
  FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph);
  FRIEND_TEST(GraphPatternDetecter, DetectPatterns);
 #endif
 private:
  using hit_rcd_t =
      std::pair<Node* /*node in graph*/, PDNode* /*node in pattern*/>;
  PDPattern pattern_;
  // NOTE(review): not referenced by any code visible in this change.
  std::vector<hit_rcd_t> marked_records_;
  // For each PDNode, the graph nodes accepted by its teller.
  std::unordered_map<const PDNode*, std::unordered_set<Node*>> pdnodes2nodes_;
 };
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc
@ -0,0 +1,172 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
 #include <gtest/gtest.h>
 namespace paddle {
 namespace framework {
 namespace ir {
 // Populate `g` with five operation nodes and four variable nodes wired as:
 //   op1 -> var1 -> op2
 //   op2 -> var2 -> {op3, op4}
 //   op2 -> var3 -> op5
 //   op3 -> var4 -> op5
 void BuildGraph(Graph* g) {
  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
  // Record the edge from -> to on both endpoints.
  auto connect = [](ir::Node* from, ir::Node* to) {
    from->outputs.push_back(to);
    to->inputs.push_back(from);
  };
  // o1->v1->o2
  connect(o1, v1);
  connect(v1, o2);
  // o2->v2->o3
  // o2->v2->o4
  connect(o2, v2);
  connect(v2, o3);
  connect(v2, o4);
  // o2->v3->o5
  connect(o2, v3);
  connect(v3, o5);
  // o3->v4->o5
  connect(o3, v4);
  connect(v4, o5);
 }
 // NewNode returns a non-null node and stores it in the pattern.
 TEST(PDPattern, NewNode) {
  PDPattern pattern;
  auto* created = pattern.NewNode([](Node*) { return true; });
  ASSERT_TRUE(created);
  ASSERT_EQ(pattern.nodes_.size(), 1UL);
 }
 // AddEdge stores exactly one edge whose endpoints are the nodes passed in,
 // visible both through the private members and the public accessors.
 TEST(PDPattern, AddEdge) {
  PDPattern pattern;
  auto* src = pattern.NewNode([](Node*) { return true; });
  auto* dst = pattern.NewNode([](Node*) { return true; });
  ASSERT_TRUE(src);
  ASSERT_TRUE(dst);
  pattern.AddEdge(src, dst);
  // Private state.
  ASSERT_EQ(pattern.nodes_.size(), 2UL);
  ASSERT_EQ(pattern.edges_.size(), 1UL);
  ASSERT_EQ(pattern.edges_.front().first, src);
  ASSERT_EQ(pattern.edges_.front().second, dst);
  // Public accessors.
  ASSERT_EQ(pattern.nodes().size(), 2UL);
  ASSERT_EQ(pattern.edges().size(), 1UL);
  ASSERT_EQ(pattern.edges().front().first, src);
  ASSERT_EQ(pattern.edges().front().second, dst);
 }
 // Build the pattern op2 -> var2 -> op3, mark it in the test graph and check
 // that all three roles get marked and exactly one match is detected.
 TEST(GraphPatternDetecter, MarkPDNodesInGraph) {
  GraphPatternDetecter detector;
  PDPattern& pattern = detector.pattern_;
  // The teller can be any condition, such as op type, or variable's shape.
  auto* o2 = pattern.NewNode([](Node* node) {
    return node && node->Name() == "op2" && node->IsOp();
  });
  auto* o3 = pattern.NewNode([](Node* node) {
    return node && node->Name() == "op3" && node->IsOp();
  });
  auto* v2 = pattern.NewNode([](Node* node) {
    return node && node->Name() == "var2" && node->IsVar();
  });
  // Every teller rejects a null node.
  ASSERT_FALSE(o2->Tell(nullptr));
  ASSERT_FALSE(o3->Tell(nullptr));
  ASSERT_FALSE(v2->Tell(nullptr));
  pattern.AddEdge(o2, v2);
  pattern.AddEdge(v2, o3);
  const auto& edges = pattern.edges();
  ASSERT_EQ(edges.size(), 2UL);
  ASSERT_EQ(edges[0].first, o2);
  ASSERT_EQ(edges[0].second, v2);
  ASSERT_EQ(edges[1].first, v2);
  ASSERT_EQ(edges[1].second, o3);
  ProgramDesc program;
  Graph graph(program);
  BuildGraph(&graph);
  detector.MarkPDNodesInGraph(graph);
  ASSERT_EQ(detector.pdnodes2nodes_.size(), 3UL);
  auto subgraphs = detector.DetectPatterns();
  ASSERT_EQ(subgraphs.size(), 1UL);
 }
 // Run the whole detector with the pattern (op2|op3) -> any var -> any op and
 // count how many non-overlapping matches reach the handler.
 TEST(GraphPatternDetecter, MultiSubgraph) {
  ProgramDesc program;
  Graph graph(program);
  BuildGraph(&graph);
  GraphPatternDetecter x;
  // The pattern is a graph:
  //   op -> var -> op
  auto* any_op = x.mutable_pattern()->NewNode(
      [](Node* node) {
        return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
      },
      "OP0");
  auto* any_var = x.mutable_pattern()->NewNode(
      [](Node* node) { return node->IsVar(); }, "VAR");
  auto* any_op1 = x.mutable_pattern()->NewNode(
      [](Node* node) { return node->IsOp(); }, "OP1");
  x.mutable_pattern()->AddEdge(any_op, any_var);
  x.mutable_pattern()->AddEdge(any_var, any_op1);
  int count = 0;
  // The graph argument is not needed here, so it is left unnamed.
  GraphPatternDetecter::handle_t handle = [&](
      const GraphPatternDetecter::subgraph_t& s, Graph*) {
    LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> "
              << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name();
    count++;
  };
  x(&graph, handle);
  // 1. Detect op3 -> var4 -> op5
  // 2. Detect op2 -> var2 -> op3
  // 3. Detect op2 -> var2 -> op4
  // 4. Detect op2 -> var3 -> op5
  // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2
  // `count` is an int: compare against int literals to avoid a
  // signed/unsigned comparison inside the gtest macros.
  ASSERT_GE(count, 1);
  ASSERT_LE(count, 2);
 }
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@ -36,7 +36,7 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "").AsDuplicable();
-    AddOutput("Out", "");
+    AddOutput("Out", "").AsDuplicable();
    AddComment("");
  }
 };
@ -59,11 +59,27 @@ class SumOpVarTypeInference : public VarTypeInference {
    block->Var(out_var_name)->SetType(default_var_type);
  }
 };
 class DummyOpMaker : public OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "").AsDuplicable();
    AddOutput("Out", "").AsDuplicable();
    AddComment("");
  }
 };
 // Var-type inference that intentionally does nothing: both arguments are
 // ignored and no variable type is touched.
 class DummyOpVarTypeInference : public VarTypeInference {
 public:
  void operator()(const OpDesc & /*op_desc*/,
                  BlockDesc * /*block*/) const override {}
 };
 }  // namespace framework
 }  // namespace paddle
 // Operator registrations for this test file.
 REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
                  paddle::framework::SumOpVarTypeInference);
 // `dummy` is registered with its own maker and its no-op var-type inference
 // rather than reusing the ones that belong to `sum`.
 REGISTER_OPERATOR(dummy, paddle::framework::NOP,
                  paddle::framework::DummyOpMaker,
                  paddle::framework::DummyOpVarTypeInference);
 // Same maker as `sum`, but registered without any var-type inference.
 REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
                  paddle::framework::SumOpMaker);
@ -110,5 +126,83 @@ TEST(GraphTest, Basic) {
  }
  ASSERT_EQ(nodes.size(), 5);
 }
 // Builds a program in which `sum` reads "a" and a later `dummy` op writes
 // "a", then checks that the graph connects the two ops through one shared
 // control-dependency variable.
 TEST(GraphTest, WriteAfterRead) {
  ProgramDesc prog;
  auto *op = prog.MutableBlock(0)->AppendOp();
  op->SetType("sum");
  op->SetInput("X", {"a"});
  op->SetOutput("Out", {"b"});
  op->SetAttr("op_role", 1);
  op = prog.MutableBlock(0)->AppendOp();
  op->SetType("dummy");
  op->SetInput("X", {"c"});
  op->SetOutput("Out", {"a"});
  op->SetAttr("op_role", 1);
  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
  ir::Node *control_dep1 = nullptr;
  ir::Node *control_dep2 = nullptr;
  for (ir::Node *n : g->Nodes()) {
    if (n->Name() == "sum") {
      // Check the size first so the indexed accesses below stay in bounds.
      ASSERT_EQ(n->outputs.size(), 2);
      ASSERT_EQ(n->outputs[0]->Name(), "b");
      ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
      control_dep1 = n->outputs[1];
    }
    if (n->Name() == "dummy") {
      ASSERT_EQ(n->inputs.size(), 2);
      ASSERT_EQ(n->inputs[0]->Name(), "c");
      ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
      control_dep2 = n->inputs[1];
    }
  }
  // Both ops must have been seen; otherwise the comparison below would pass
  // vacuously as nullptr == nullptr.
  ASSERT_TRUE(control_dep1 != nullptr);
  ASSERT_TRUE(control_dep2 != nullptr);
  ASSERT_EQ(control_dep1, control_dep2);
 }
 // Builds a program in which `sum` and a later `dummy` op both write "b",
 // then checks that the graph connects the two ops through one shared
 // control-dependency variable.
 TEST(GraphTest, WriteAfterWrite) {
  ProgramDesc prog;
  auto *op = prog.MutableBlock(0)->AppendOp();
  op->SetType("sum");
  op->SetInput("X", {"a"});
  op->SetOutput("Out", {"b"});
  op->SetAttr("op_role", 1);
  op = prog.MutableBlock(0)->AppendOp();
  op->SetType("dummy");
  op->SetInput("X", {"c"});
  op->SetOutput("Out", {"b"});
  op->SetAttr("op_role", 1);
  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
  ir::Node *control_dep1 = nullptr;
  ir::Node *control_dep2 = nullptr;
  for (ir::Node *n : g->Nodes()) {
    if (n->Name() == "sum") {
      // Check the size first so the indexed accesses below stay in bounds.
      ASSERT_EQ(n->outputs.size(), 2);
      ASSERT_EQ(n->outputs[0]->Name(), "b");
      ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
      control_dep1 = n->outputs[1];
    }
    if (n->Name() == "dummy") {
      ASSERT_EQ(n->inputs.size(), 2);
      ASSERT_EQ(n->inputs[0]->Name(), "c");
      ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
      control_dep2 = n->inputs[1];
    }
  }
  // Compare only after the whole node set has been scanned: the result must
  // not depend on whether "sum" or "dummy" is iterated first.
  ASSERT_TRUE(control_dep1 != nullptr);
  ASSERT_TRUE(control_dep2 != nullptr);
  ASSERT_EQ(control_dep1, control_dep2);
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@ -0,0 +1,69 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/ir/graph_traits.h"
 namespace paddle {
 namespace framework {
 namespace ir {
 //
 // NodesDFSIterator
 //
 // Seeds the traversal: every node of `source` is pushed in order, so the last
 // element of `source` ends up on top of the stack.
 NodesDFSIterator::NodesDFSIterator(const std::vector<Node *> &source) {
  for (Node *start : source) {
    stack_.push(start);
  }
 }
 // Move constructor: takes over the pending-node stack and the visited set of
 // `other`.
 NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept
    : stack_(std::move(other.stack_)),
      visited_(std::move(other.visited_)) {}
 // Copy constructor: duplicates the pending-node stack and the visited set of
 // `other`, so the two iterators advance independently afterwards.
 NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other)
    : stack_(other.stack_), visited_(other.visited_) {}
 // Returns the node currently on top of the stack; enforces that the stack is
 // not empty.
 Node &NodesDFSIterator::operator*() {
  PADDLE_ENFORCE(!stack_.empty());
  Node *current = stack_.top();
  return *current;
 }
 // Advances the traversal: marks the current node as visited, removes it from
 // the stack and pushes each of its outputs that has not been visited yet.
 // Enforces that the iterator is not already exhausted.
 NodesDFSIterator &NodesDFSIterator::operator++() {
  PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range");
  auto *cur = stack_.top();
  stack_.pop();
  visited_.insert(cur);
  for (auto *x : cur->outputs) {
    if (!visited_.count(x)) {
      stack_.push(x);
    }
  }
  // A node reachable through several paths may have been pushed more than
  // once before its first visit. Drop such stale entries so that every node
  // is yielded at most once.
  while (!stack_.empty() && visited_.count(stack_.top())) {
    stack_.pop();
  }
  return *this;
 }
 // Two iterators are equal when both are exhausted, or when both still have
 // pending nodes and their current (top) nodes are the same pointer.
 bool NodesDFSIterator::operator==(const NodesDFSIterator &other) {
  const bool lhs_done = stack_.empty();
  const bool rhs_done = other.stack_.empty();
  if (lhs_done || rhs_done) {
    return lhs_done && rhs_done;
  }
  return stack_.top() == other.stack_.top();
 }
 // Copy assignment: replaces this iterator's pending-node stack and visited
 // set with copies of those held by `other`, then returns *this.
 NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) {
  stack_ = other.stack_;
  visited_ = other.visited_;
  return *this;
 }
 // Returns a pointer to the node currently on top of the stack. Like
 // operator*, it enforces a non-empty stack, because calling top() on an
 // empty std::stack is undefined behaviour.
 Node *NodesDFSIterator::operator->() {
  PADDLE_ENFORCE(!stack_.empty());
  return stack_.top();
 }
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_traits.h
+++ b/paddle/fluid/framework/ir/graph_traits.h
@ -0,0 +1,90 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <iterator>
 #include <stack>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 namespace paddle {
 namespace framework {
 namespace ir {
 // A [begin, end) pair of iterators exposing begin()/end(), so it can be used
 // directly in a range-based for loop.
 template <typename IteratorT>
 class iterator_range {
 public:
  // Builds the range from an explicit pair of iterators.
  iterator_range(const IteratorT &first, const IteratorT &last)
      : begin_(first), end_(last) {}
  // Builds the range spanning a whole container via its begin()/end().
  template <typename Container>
  explicit iterator_range(Container &&container)
      : begin_(container.begin()), end_(container.end()) {}
  const IteratorT &begin() const { return begin_; }
  const IteratorT &end() const { return end_; }
 private:
  // Kept in this order so begin_ is initialised before end_.
  IteratorT begin_;
  IteratorT end_;
 };
 // DFS iterator on nodes.
 //
 // The nodes still to be visited are kept in `stack_`; `visited_` records the
 // nodes that have already been stepped over.
 // NOTE(review): std::iterator is deprecated since C++17, and the value type
 // declared here is `Node *` while operator* returns `Node &` - confirm
 // whether the iterator traits are relied upon anywhere.
 struct NodesDFSIterator
    : public std::iterator<std::forward_iterator_tag, Node *> {
  // Creates an iterator with no pending nodes.
  NodesDFSIterator() = default;
  // Creates an iterator whose pending nodes are taken from `source`.
  explicit NodesDFSIterator(const std::vector<Node *> &source);
  NodesDFSIterator(NodesDFSIterator &&other) noexcept;
  NodesDFSIterator(const NodesDFSIterator &other);
  // Access to the current node.
  Node &operator*();
  // Steps to the next node.
  NodesDFSIterator &operator++();
  NodesDFSIterator &operator=(const NodesDFSIterator &other);
  // TODO(Superjomn) current implementation just compare the first
  // element, need to compare the graph and all the elements in the queue and
  // set.
  bool operator==(const NodesDFSIterator &other);
  bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
  // Pointer-style access to the current node.
  Node *operator->();
 private:
  // Nodes waiting to be visited; the top element is the current node.
  std::stack<Node *> stack_;
  // Nodes that have already been stepped over.
  std::unordered_set<Node *> visited_;
 };
 /*
 * GraphTraits contains some graph traversal algorithms.
 *
 * Usage:
 *
 *   for (auto &node : GraphTraits::DFS(graph)) {
 *     // `node` is a Node& handed out by NodesDFSIterator.
 *   }
 */
 struct GraphTraits {
  // Returns a range over the nodes of `g`. It starts from the nodes that have
  // no inputs and ends at a default-constructed (empty) iterator.
  static iterator_range<NodesDFSIterator> DFS(const Graph &g) {
    auto start_points = ExtractStartPoints(g);
    return iterator_range<NodesDFSIterator>(NodesDFSIterator(start_points),
                                            NodesDFSIterator());
  }
 private:
  // The nodes those have no input will be treated as start points.
  static std::vector<Node *> ExtractStartPoints(const Graph &g) {
    std::vector<Node *> result;
    for (auto *node : g.Nodes()) {
      if (node->inputs.empty()) {
        result.push_back(node);
      }
    }
    return result;
  }
 };
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@ -58,6 +58,9 @@ class Node {
    return op_desc_;
  }
  bool IsOp() const { return type_ == Type::kOperation; }
  bool IsVar() const { return type_ == Type::kVariable; }
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
  }
 }
-int OpDesc::GetBlockAttr(const std::string &name) const {
+std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
  // Returns the ID of every block held by the block-list attribute `name`;
  // enforces that the attribute exists.
  auto it = attrs_.find(name);
  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
  // Bind by reference: copying the whole vector just to read the IDs is
  // wasted work.
  const auto &blocks = boost::get<std::vector<BlockDesc *>>(it->second);
  std::vector<int> ids;
  ids.reserve(blocks.size());
  for (auto *block : blocks) {
    ids.push_back(block->ID());
  }
  return ids;
 }
 int OpDesc::GetBlockAttrId(const std::string &name) const {
  auto it = attrs_.find(name);
  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
  return boost::get<BlockDesc *>(it->second)->ID();
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@ -83,7 +83,9 @@ class OpDesc {
  Attribute GetNullableAttr(const std::string &name) const;
-  int GetBlockAttr(const std::string &name) const;
+  int GetBlockAttrId(const std::string &name) const;
  std::vector<int> GetBlocksAttrIds(const std::string &name) const;
  void Rename(const std::string &old_name, const std::string &new_name);
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
      for (const std::string &attr_name : op->AttrNames()) {
        if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
          int sub_block_id =
-              o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name);
+              o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
          op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
        }
      }
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@ -112,5 +112,6 @@ Tensor& Tensor::Resize(const DDim& dims) {
 const DDim& Tensor::dims() const { return dims_; }
 int64_t Tensor::numel() const { return product(dims_); }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@ -59,6 +59,14 @@ inline T* Tensor::mutable_data(platform::Place place) {
 }
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
  int rank = src.dims().size();
  PADDLE_ENFORCE_GE(
      rank, 2,
      "'ReshapeToMatrix()' is only used for flatten high rank "
      "tensors to matrixs. Can not be used in reshaping vectors.");
  if (rank == 2) {
    return src;
  }
  Tensor res;
  res.ShareDataWith(src);
  res.Resize(flatten_to_2d(src.dims(), num_col_dims));
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@ -20,6 +20,9 @@
 DEFINE_int32(io_threadpool_size, 100,
             "number of threads used for doing IO, default 100");
 DEFINE_int32(dist_threadpool_size, 0,
             "number of threads used for distributed executed.");
 namespace paddle {
 namespace framework {
@ -35,6 +38,10 @@ void ThreadPool::Init() {
  if (threadpool_.get() == nullptr) {
    // TODO(Yancey1989): specify the max threads number
    int num_threads = std::thread::hardware_concurrency();
    if (FLAGS_dist_threadpool_size > 0) {
      num_threads = FLAGS_dist_threadpool_size;
      VLOG(1) << "set dist_threadpool_size to " << num_threads;
    }
    PADDLE_ENFORCE_GT(num_threads, 0);
    threadpool_.reset(new ThreadPool(num_threads));
  }
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@ -22,6 +22,9 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/platform/profiler.h"
 DEFINE_bool(profile, false, "Turn on profiler for fluid");
 namespace paddle {
 namespace {
@ -58,6 +61,15 @@ bool NativePaddlePredictor::Init(
    std::shared_ptr<framework::Scope> parent_scope) {
  VLOG(3) << "Predictor::init()";
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is actived, might affect the performance";
    LOG(INFO) << "You can turn off by set gflags '-profile false'";
    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
                                           : platform::ProfilerState::kCPU;
    platform::EnableProfiler(tracking_device);
  }
  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
  } else {
@ -102,6 +114,10 @@ bool NativePaddlePredictor::Init(
 }
 NativePaddlePredictor::~NativePaddlePredictor() {
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }
--- a/paddle/fluid/operators/.flatten_op.cc.swp
+++ b/paddle/fluid/operators/.flatten_op.cc.swp
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@ -170,6 +170,9 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
      elseif(${TARGET} STREQUAL "tensorrt_engine_op")
          message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
      elseif(${TARGET} STREQUAL "fc")
        # HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
      else()
        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
      endif()
@ -300,12 +303,6 @@ op_library(channel_recv_op DEPS concurrency)
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 # The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
 # Because the fully connected layer has only one MKLDNN's operator
 if(NOT WITH_MKLDNN)
    list(REMOVE_ITEM GENERAL_OPS fc_op)
 endif(NOT WITH_MKLDNN)
 foreach(src ${GENERAL_OPS})
    op_library(${src})
 endforeach()
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@ -28,23 +28,26 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
    auto x_dims = ctx->GetInputDim("X");
    auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
+    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(rank, label_dims.size(),
-                      "Input(Label)'s rank should be 2.");
+                      "Input(X) and Input(Label) shall have the same rank.");
-    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      "The 1st dimension of Input(X) and Input(Label) should "
+                      framework::slice_ddim(label_dims, 0, rank - 1),
-                      "be equal.");
+                      "Input(X) and Input(Label) shall have the same shape "
                      "except the last dimension.");
    if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
+      PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
-                        "If Attr(soft_label) == true, the 2nd dimension of "
+                        "If Attr(soft_label) == true, the last dimension of "
                        "Input(X) and Input(Label) should be equal.");
    } else {
-      PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
+      PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL,
-                        "If Attr(softLabel) == false, the 2nd dimension of "
+                        "If Attr(softLabel) == false, the last dimension of "
                        "Input(Label) should be 1.");
    }
-    ctx->SetOutputDim("Y", {x_dims[0], 1});
+    auto y_dims = x_dims;
    y_dims[rank - 1] = 1;
    ctx->SetOutputDim("Y", y_dims);
    ctx->ShareLoD("X", /*->*/ "Y");
  }
@ -74,24 +77,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
    auto x_dims = ctx->GetInputDim("X");
    auto label_dims = ctx->GetInputDim("Label");
    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
+                      "Input(Y@Grad) and Input(X) should have the same rank.");
-    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+    PADDLE_ENFORCE_EQ(label_dims.size(), rank,
-                      "The 1st dimension of Input(X) and Input(Label) should "
+                      "Input(Label) and Input(X) should have the same rank.");
-                      "be equal.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
+                      framework::slice_ddim(label_dims, 0, rank - 1),
-                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
+                      "The Input(X) and Input(Label) should have the same "
-                      "be equal.");
+                      "shape except the last dimension.");
-    PADDLE_ENFORCE_EQ(dy_dims[1], 1,
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      "The 2nd dimension of Input(Y@Grad) should be 1.");
+                      framework::slice_ddim(dy_dims, 0, rank - 1),
                      "The Input(X) and Input(Y@Grad) should have the same "
                      "shape except the last dimension.");
    PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
                      "The last dimension of Input(Y@Grad) should be 1.");
    if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
+      PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
-                        "When Attr(soft_label) == true, the 2nd dimension of "
+                        "When Attr(soft_label) == true, the last dimension of "
                        "Input(X) and Input(Label) should be equal.");
    } else {
-      PADDLE_ENFORCE_EQ(label_dims[1], 1,
+      PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
-                        "When Attr(soft_label) == false, the 2nd dimension of "
+                        "When Attr(soft_label) == false, the last dimension of "
                        "Input(Label) should be 1.");
    }
    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
@ -113,18 +120,20 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
+             "(Tensor, default Tensor<float>), a tensor whose last dimension "
-             " where N is the batch size and D is the number of classes. "
+             "size is equal to the number of classes. This input is a "
-             "This input is a probability computed by the previous operator, "
+             "probability computed by the previous operator, which is almost "
-             "which is almost always the result of a softmax operator.");
+             "always the result of a softmax operator.");
-    AddInput("Label",
+    AddInput(
-             "(Tensor), the ground truth which is a 2-D tensor. When "
+        "Label",
-             "soft_label is set to false, Label is a Tensor<int64> with shape "
+        "(Tensor), the tensor which represents the ground truth. It has the "
-             "[N x 1]. When soft_label is set to true, Label is a "
+        "same shape with 'X' except the last dimension. When soft_label is set "
-             "Tensor<float/double> with shape [N x D].");
+        "to false, the last dimension size is 1; when soft_label is set to "
        "true, the last dimension size is equal to the number of classes.");
    AddOutput("Y",
-              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
+              "(Tensor, default Tensor<float>), a tensor whose shape is same "
-              "[N x 1]. The cross entropy loss.");
+              "with 'X' except that the last dimension size is 1. It "
              "represents the cross entropy loss.");
    AddAttr<bool>("soft_label",
                  "(bool, default false), a flag indicating whether to "
                  "interpretate the given labels as soft labels.")
@ -132,6 +141,12 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 CrossEntropy Operator.
 The input 'X' and 'Label' will first be logically flattened to 2-D matrices.
 The matrix's second dimension (row length) is the same as the original last
 dimension, and the first dimension (column length) is the product of all other
 original dimensions. Then the cross-entropy computation will take place on
 each row of the flattened matrices.
 It supports both standard cross-entropy and soft-label cross-entropy loss
 computation.
 1) One-hot cross-entropy:
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@ -33,8 +33,13 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
    auto* y = ctx.Output<Tensor>("Y");
    y->mutable_data<T>(ctx.GetPlace());
    int rank = x->dims().size();
    Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
    Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
    Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
    math::CrossEntropyFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), y, x, labels,
+        ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d,
        ctx.Attr<bool>("soft_label"));
  }
 };
@ -98,9 +103,12 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
    auto* label = ctx.Input<Tensor>("Label");
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    int64_t class_num = x->dims()[1];
+    // Following computation only depends on the last dimension size. So it's
    // unnecessary to convert tensors to 2-D views.
    int rank = x->dims().size();
    int64_t class_num = x->dims()[rank - 1];
    if (ctx.Attr<bool>("soft_label")) {
      XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
                                        label->data<T>(),
--- a/Show More
+++ b/Show More