Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize-thread-pool

test=develop
7 years ago · fe4cd50286
parent ac415c0094 d6a6a13039
commit fe4cd50286
120 changed files with 4363 additions and 470 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -62,7 +62,6 @@ option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
-option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -50,11 +50,7 @@ if(NOT WITH_PROFILER)
 endif(NOT WITH_PROFILER)

 if(NOT CMAKE_CROSSCOMPILING)
-    if(WITH_AVX AND AVX512F_FOUND)
-        set(SIMD_FLAG ${AVX512F_FLAG})
-    elseif(WITH_AVX AND AVX2_FOUND)
-        set(SIMD_FLAG ${AVX2_FLAG})
-    elseif(WITH_AVX AND AVX_FOUND)
+    if(WITH_AVX AND AVX_FOUND)
        set(SIMD_FLAG ${AVX_FLAG})
    elseif(SSE3_FOUND)
        set(SIMD_FLAG ${SSE3_FLAG})
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@ -89,7 +89,9 @@ CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
-    __m512i a = _mm512_undefined_epi32();
+    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+                                  13, -5, 6, -7, 9, 2, -6, 3);
+    __m512i result = _mm512_abs_epi32 (a);
    return 0;
 }" AVX512F_FOUND)

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -67,8 +67,8 @@ paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size',
 paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
-paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
-paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
+paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
+paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
@ -174,9 +174,11 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
+paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@ -35,13 +35,15 @@ if(WITH_GPU)
          all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 endif()

+cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
+
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)

 if(WITH_GPU)
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
+  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass sequential_execution_pass)
 else()
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
+  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto sequential_execution_pass)
 endif()

 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@ -16,6 +16,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"

@ -27,6 +28,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
 public:
  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
      : ir::PassBuilder(), strategy_(strategy) {
+    if (strategy_.enable_sequential_execution_) {
+      AppendPass("sequential_execution_pass");
+    }
+
    // Add a graph viz pass to record a graph.
    if (!strategy_.debug_graphviz_path_.empty()) {
      auto viz_pass = AppendPass("graph_viz_pass");
@ -110,6 +115,11 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
      pass->Erase("nccl_ctxs");
      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
+    } else if (pass->Type() == "sequential_execution_pass") {
+      pass->Erase(kAllOpDescs);
+      pass->Set<const std::vector<OpDesc *>>(
+          kAllOpDescs,
+          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
    }
    graph = pass->Apply(std::move(graph));
  }
@ -125,3 +135,4 @@ USE_PASS(multi_batch_merge_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
+USE_PASS(sequential_execution_pass);
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@ -69,6 +69,8 @@ struct BuildStrategy {

  bool enable_data_balance_{false};

+  bool enable_sequential_execution_{false};
+
  bool fuse_broadcast_op_{false};

  // User normally doesn't need to call this API.
--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@ -29,22 +29,19 @@ RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc,
      place_(place) {}

 void RPCOpHandle::RunImpl() {
-  // TODO(wuyi): need further analysis whether wait VarDummyHandle.
-  // Wait input done
  for (auto *in : inputs_) {
    auto &p = static_cast<VarHandle *>(in)->place_;
-    // FIXME(Yancey1989): need a better solution instead of use DebugString()
-    if (ir::IsControlDepVar(*in->Node())) {  // HACK
+    if (ir::IsControlDepVar(*in->Node())) {
      continue;
    }
    if (in->GeneratedOp()) {
      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
    }
  }
-  auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  // FIXME(wuyi): can not use RunAndRecordEvent here, for it will cause dead
-  // lock.
-  op_->Run(*tmp_scope, place_);
+  this->RunAndRecordEvent([this] {
+    op_->Run(*local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(),
+             place_);
+  });
 }

 std::string RPCOpHandle::Name() const { return name_; }
--- a/paddle/fluid/framework/details/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/details/sequential_execution_pass.cc
@ -0,0 +1,109 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/sequential_execution_pass.h"
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) {
+  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
+         op1->Outputs() == op2->Outputs();
+}
+
+std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  // FIXME(zjl): Insert dependencies between some distributed ops may cause
+  // the multi_devices_graph_pass fails. So we skip these ops here.
+  // Indeed, maybe we should not insert dependencies between these ops
+  // casually, which may cause deadlock easily.
+  // We should add more skipped distributed ops when found errors in
+  // multi_devices_graph_pass
+  static std::unordered_set<std::string> skip_dist_ops{
+      "send", "recv", "send_barrier", "fetch_barrier"};
+
+  auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
+  std::vector<ir::Node *> op_node_list;
+  op_node_list.reserve(ops.size());
+
+  std::unordered_map<ir::Node *, size_t> op_deps;
+  std::unordered_map<ir::Node *, std::unordered_set<ir::Node *>> pending_ops;
+  std::unordered_set<ir::Node *> ready_ops;
+
+  for (ir::Node *node : graph->Nodes()) {
+    if (!node->IsOp()) continue;
+    std::unordered_set<ir::Node *> preceding_ops;
+    for (auto *in : node->inputs) {
+      PADDLE_ENFORCE(in->IsVar(),
+                     "Preceding Node of Op Nodes must be Var Node");
+      if (in->inputs.empty()) continue;
+      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp(),
+                     "Preceding Op Node of Var Node must be unique");
+      preceding_ops.insert(in->inputs[0]);
+      pending_ops[in->inputs[0]].insert(node);
+    }
+    op_deps[node] = preceding_ops.size();
+    if (preceding_ops.empty()) {
+      ready_ops.insert(node);
+    }
+  }
+
+  for (auto *op_desc : ops) {
+    ir::Node *found_node = nullptr;
+    for (auto *node : ready_ops) {
+      if (IsSameOpDesc(op_desc, node->Op())) {
+        PADDLE_ENFORCE(found_node == nullptr,
+                       "Found multiple op_desc in graph: %s", op_desc->Type());
+        found_node = node;
+      }
+    }
+
+    PADDLE_ENFORCE_NOT_NULL(found_node, "Cannot find op_desc in graph: %s",
+                            op_desc->Type());
+    for (auto *pending_op : pending_ops[found_node]) {
+      if (--op_deps.at(pending_op) == 0) {
+        ready_ops.insert(pending_op);
+      }
+    }
+    ready_ops.erase(found_node);
+    if (skip_dist_ops.count(op_desc->Type()) == 0) {
+      op_node_list.push_back(found_node);
+    }
+  }
+
+  for (size_t i = 1; i < op_node_list.size(); ++i) {
+    auto *dep_var = graph->CreateControlDepVar();
+    op_node_list[i]->inputs.push_back(dep_var);
+    op_node_list[i - 1]->outputs.push_back(dep_var);
+    dep_var->outputs.push_back(op_node_list[i]);
+    dep_var->inputs.push_back(op_node_list[i - 1]);
+    VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
+             << " and " << op_node_list[i]->Name();
+  }
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(sequential_execution_pass,
+              paddle::framework::details::SequentialExecutionPass)
+    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
--- a/paddle/fluid/framework/details/sequential_execution_pass.h
+++ b/paddle/fluid/framework/details/sequential_execution_pass.h
@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+constexpr char kAllOpDescs[] = "all_op_descs";
+
+class SequentialExecutionPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -85,8 +85,10 @@ Executor::Executor(const platform::Place& place) : place_(place) {}

 void Executor::Close() {
 #ifdef PADDLE_WITH_DISTRIBUTE
+  // TODO(typhoonzero): complete message will need to use real trainer_id,
+  // except 0.
  ::paddle::operators::distributed::RPCClient::GetInstance<
-      ::paddle::operators::distributed::GRPCClient>()
+      ::paddle::operators::distributed::GRPCClient>(0)
      ->SendComplete();
 #endif
 }
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@ -41,6 +41,7 @@ pass_library(conv_bn_fuse_pass inference)
 pass_library(seqconv_eltadd_relu_fuse_pass inference)
 if(WITH_MKLDNN)
    pass_library(mkldnn_placement_pass base)
+    pass_library(depthwise_conv_mkldnn_pass base)
    pass_library(conv_bias_mkldnn_fuse_pass inference)
    pass_library(conv_relu_mkldnn_fuse_pass inference)
    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
@ -59,6 +60,7 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 if (WITH_MKLDNN)
+    cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
    cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
@ -31,7 +31,8 @@ class ConvReLUFusePass : public FusePassBase {
  virtual ~ConvReLUFusePass() {}

 protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };

 }  // namespace ir
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"

 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"

 namespace paddle {
 namespace framework {
@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
    op->SetInput("X", inputs);
  }
  op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
 }

 // a->OP0->b
--- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_NODE(id, pattern)                               \
+  PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
+                 "pattern has no Node called %s", #id);     \
+  auto* id = subgraph.at(pattern.RetrieveNode(#id));        \
+  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+
+std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get());
+  GraphPatternDetector gpd;
+
+  auto* pattern = gpd.mutable_pattern();
+  pattern->NewNode("depthwise_conv")
+      ->assert_is_op("depthwise_conv2d")
+      ->assert_op_attr("use_mkldnn", true);
+
+  int found_depthwise_conv_mkldnn_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
+    GET_NODE(depthwise_conv, (*pattern));
+    depthwise_conv->Op()->SetType("conv2d");
+    found_depthwise_conv_mkldnn_count++;
+  };
+
+  gpd(graph.get(), handler);
+  AddStatis(found_depthwise_conv_mkldnn_count);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(depthwise_conv_mkldnn_pass,
+              paddle::framework::ir::DepthwiseConvMKLDNNPass);
--- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class DepthwiseConvMKLDNNPass : public FusePassBase {
+ public:
+  virtual ~DepthwiseConvMKLDNNPass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
@ -0,0 +1,123 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", use_mkldnn);
+  op->SetAttr("name", name);
+  op->SetInput("Input", {inputs[0]});
+  op->SetInput("Filter", {inputs[1]});
+  op->SetInput("Bias", {inputs[2]});
+  op->SetOutput("Out", outputs);
+}
+
+// (a, weights, bias)->depthwise conv mkldnn->b
+// (b, weights2, bias2)->depthwise conv no mkldnn->c
+// (c, weights3, bias3)->conv mkldnn->d
+// (d, weights3, bias3)->conv no mkldnn->e
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v : std::vector<std::string>(
+           {"a", "b", "c", "d", "e", "weights", "bias", "weights2", "bias2",
+            "weights3", "bias3", "weights4", "bias4"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2" ||
+        v == "weights3" || v == "bias3" || v == "weights4" || v == "bias4") {
+      var->SetPersistable(true);
+    }
+  }
+
+  // depthwise conv with MKL-DNN
+  SetOp(&prog, "depthwise_conv2d", "conv1",
+        std::vector<std::string>({"a", "weights", "bias"}),
+        std::vector<std::string>({"b"}), true);
+  // depthwise conv without MKL-DNN
+  SetOp(&prog, "depthwise_conv2d", "conv2",
+        std::vector<std::string>({"b", "weights2", "bias2"}),
+        std::vector<std::string>({"c"}), false);
+  // conv with MKL-DNN
+  SetOp(&prog, "conv2d", "conv3",
+        std::vector<std::string>({"c", "weights3", "bias3"}),
+        std::vector<std::string>({"d"}), true);
+  // conv without MKL-dNN
+  SetOp(&prog, "conv2d", "conv4",
+        std::vector<std::string>({"d", "weights4", "bias4"}),
+        std::vector<std::string>({"e"}), false);
+
+  return prog;
+}
+
+TEST(DepthwiseConvMKLDNNPass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass");
+
+  struct counters {
+    int mkldnn_depthwise_conv_nodes;
+    int other_depthwise_conv_nodes;
+    int mkldnn_conv_nodes;
+    int other_conv_nodes;
+  };
+
+  counters before{1, 1, 1, 1};
+
+  graph = pass->Apply(std::move(graph));
+
+  // initialize counters before loop
+  counters after{0, 0, 0, 0};
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "conv2d") {
+        if (boost::get<bool>(op->GetAttr("use_mkldnn")))
+          after.mkldnn_conv_nodes++;
+        else
+          after.other_conv_nodes++;
+      } else if (op->Type() == "depthwise_conv2d") {
+        if (boost::get<bool>(op->GetAttr("use_mkldnn")))
+          after.mkldnn_depthwise_conv_nodes++;
+        else
+          after.other_depthwise_conv_nodes++;
+      }
+    }
+  }
+
+  EXPECT_EQ(after.other_depthwise_conv_nodes,
+            before.other_depthwise_conv_nodes);
+  EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes);
+  EXPECT_EQ(after.mkldnn_depthwise_conv_nodes,
+            before.mkldnn_depthwise_conv_nodes - 1);
+  EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(depthwise_conv_mkldnn_pass);
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"

 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"

 namespace paddle {
 namespace framework {
@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
    op->SetInput("X", inputs);
  }
  op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
 }

 // a->OP0->b
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@ -23,8 +23,67 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+namespace {
+
+void CheckProgram(const ProgramDesc &program) {
+#define _INT(role) static_cast<int>(role)
+
+  std::map<int, bool> visit;
+  for (OpDesc *op : program.Block(0).AllOps()) {
+    // For backward compatibility, some program doesn't have role added.
+    if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
+    int role_id =
+        boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
+    visit[role_id] = true;
+    switch (role_id) {
+      case _INT(OpRole::kForward):
+        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
+          LOG(ERROR)
+              << "Cannot add backward operator before forward operator %s."
+              << op->Type();
+        }
+        break;
+      case _INT(OpRole::kBackward):
+      case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
+        PADDLE_ENFORCE(
+            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+            "Cannot add backward operator %s after optimize operator.",
+            op->Type());
+        break;
+      case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
+        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
+                                  _INT(OpRole::kLoss)) == visit.end(),
+                       "Cannot add backward|loss operator before "
+                       "forward|loss operator %s.",
+                       op->Type());
+        PADDLE_ENFORCE(
+            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+            "Cannot add forward|loss operator %s after optimize operator.",
+            op->Type());
+        break;
+      case _INT(OpRole::kOptimize):
+      case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
+        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
+                       "Optimize operators %s must follow backward operator.",
+                       op->Type());
+        break;
+      case _INT(OpRole::kLRSched):
+      case _INT(OpRole::kDist):
+      case _INT(OpRole::kRPC):
+      case _INT(OpRole::kNotSpecified):
+        break;
+      default:
+        LOG(FATAL) << "Unknown operator role. Don't add new role because "
+                      "you don't know what you are doing.";
+    }
+  }
+
+#undef _INT
+}
+}  // namespace

 Graph::Graph(const ProgramDesc &program) : program_(program) {
+  CheckProgram(program_);
  // Make the nodes id start from 0.
  Node::ResetId();
  auto var_nodes = InitFromProgram(program_);
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() {
  return result;
 }

+bool GraphItemCMP(const std::pair<PDNode *, Node *> &a,
+                  const std::pair<PDNode *, Node *> &b) {
+  if (a.first != b.first) {
+    return a.first < b.first;
+  } else {
+    return a.second < b.second;
+  }
+}
+
 // TODO(Superjomn) enhance the function as it marks unique unique as duplicates
 // see https://github.com/PaddlePaddle/Paddle/issues/13550
 void GraphPatternDetector::UniquePatterns(
@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns(
  std::vector<GraphPatternDetector::subgraph_t> result;

  std::unordered_set<size_t> set;
+  std::hash<std::string> hasher;
  for (auto &g : *subgraphs) {
-    size_t key = 0;
-    for (auto &item : g) {
-      key ^= std::hash<void *>{}(item.first);
-      key ^= std::hash<void *>{}(item.second);
+    // Sort the items in the sub-graph, and transform to a string key.
+    std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
+    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP);
+    std::stringstream ss;
+    for (auto &item : sorted_keys) {
+      ss << item.first << ":" << item.second;
    }
+    auto key = hasher(ss.str());
    if (!set.count(key)) {
      result.emplace_back(g);
      set.insert(key);
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor(
    PADDLE_ENFORCE_EQ(new_lod.size(), lod.size());
    for (size_t j = 0; j < lod.size(); ++j) {
      auto &sub_lod = new_lod[j];
-      auto &offset = sub_lod.back();
+      size_t offset = sub_lod.back();
      for (size_t k = 1; k < lod[j].size(); ++k) {
        sub_lod.push_back(lod[j][k] + offset);
      }
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
+  } else if (platform::is_cuda_pinned_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
+                 nullptr);
  }
 #endif
 }
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@ -1,3 +1,6 @@
+if(WITH_TESTING)
+  include(test.cmake) # some generic cmake funtion for inference
+endif()
 # analysis and tensorrt must be added before creating static library,
 # otherwise, there would be undefined reference to them in static library.
 add_subdirectory(analysis)
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
 cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid)

-function (inference_analysis_test TARGET)
-    if(WITH_TESTING)
-        set(options "")
-        set(oneValueArgs "")
-        set(multiValueArgs SRCS ARGS EXTRA_DEPS)
-        cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-        set(mem_opt "")
-        if(WITH_GPU)
-            set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
-        endif()
-        cc_test(${TARGET}
-                SRCS "${analysis_test_SRCS}"
-                DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
-        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
-    endif(WITH_TESTING)
+function(inference_analysis_test TARGET)
+  if(WITH_TESTING)
+     set(options "")
+     set(oneValueArgs "")
+     set(multiValueArgs SRCS ARGS EXTRA_DEPS)
+     cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+     inference_base_test(${TARGET}
+             SRCS ${analysis_test_SRCS}
+             DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
+             ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS})
+  endif()
 endfunction(inference_analysis_test)

 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@ -79,6 +79,7 @@ class Analyzer : public OrderedRegistry<PassManager> {
      "conv_bn_fuse_pass",              //
      "conv_eltwiseadd_bn_fuse_pass",   //
 #ifdef PADDLE_WITH_MKLDNN
+      "depthwise_conv_mkldnn_pass",             //
      "conv_bias_mkldnn_fuse_pass",             //
      "conv_relu_mkldnn_fuse_pass",             //
      "conv_elementwise_add_mkldnn_fuse_pass",  //
--- a/Show More
+++ b/Show More