Merge remote-tracking branch 'origin/develop' into fix/sequence_pad

test=develop
revert-14324-fix_vlog
dongzhihong 6 years ago
commit d4e8d7077f

@ -67,8 +67,8 @@ paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size',
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False))
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
@ -178,6 +178,7 @@ paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], var
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
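The updated ArgSpecs above add an `exclusive` flag to pool2d/pool3d and a `numeric_stable_mode` flag to softmax_with_cross_entropy. A minimal usage sketch of the new arguments, assuming an otherwise standard fluid program (the fc layer and variable names are illustrative, not part of this diff):

import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

# exclusive=False keeps the zero-padding in the averaging divisor (default is True).
pooled = fluid.layers.pool2d(img, pool_size=2, pool_type='avg',
                             pool_padding=1, exclusive=False)

logits = fluid.layers.fc(pooled, size=10)
# numeric_stable_mode=True opts into the numerically stable path (default is False).
loss = fluid.layers.softmax_with_cross_entropy(
    logits=logits, label=label, numeric_stable_mode=True)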

@ -35,13 +35,15 @@ if(WITH_GPU)
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
endif()
cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
if(WITH_GPU)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass sequential_execution_pass)
else()
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto sequential_execution_pass)
endif()
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope

@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
@ -27,6 +28,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
: ir::PassBuilder(), strategy_(strategy) {
if (strategy_.enable_sequential_execution_) {
AppendPass("sequential_execution_pass");
}
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
@ -110,6 +115,11 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "sequential_execution_pass") {
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
}
graph = pass->Apply(std::move(graph));
}
@ -125,3 +135,4 @@ USE_PASS(multi_batch_merge_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(sequential_execution_pass);

@ -69,6 +69,8 @@ struct BuildStrategy {
bool enable_data_balance_{false};
bool enable_sequential_execution_{false};
bool fuse_broadcast_op_{false};
// Users normally do not need to call this API.
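The new enable_sequential_execution_ flag in BuildStrategy turns on the sequential_execution_pass registered above. A rough sketch of flipping the switch from Python, assuming the flag is exposed on fluid.BuildStrategy under the name enable_sequential_execution (the attribute name and the ParallelExecutor setup are assumptions, not shown in this diff):

import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
# Assumed Python-side counterpart of the C++ enable_sequential_execution_ flag;
# when set, ops are chained in program order by sequential_execution_pass.
build_strategy.enable_sequential_execution = True

# `loss` is assumed to come from a previously built network.
exe = fluid.ParallelExecutor(use_cuda=True,
                             loss_name=loss.name,
                             build_strategy=build_strategy)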

@ -0,0 +1,109 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace details {
static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
// FIXME(zjl): Inserting dependencies between some distributed ops may cause
// the multi_devices_graph_pass to fail, so we skip these ops here.
// Indeed, we probably should not insert dependencies between these ops
// casually, since doing so can easily cause deadlock.
// Add more distributed ops to the skip list when errors are found in
// multi_devices_graph_pass.
static std::unordered_set<std::string> skip_dist_ops{
"send", "recv", "send_barrier", "fetch_barrier"};
auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
std::vector<ir::Node *> op_node_list;
op_node_list.reserve(ops.size());
std::unordered_map<ir::Node *, size_t> op_deps;
std::unordered_map<ir::Node *, std::unordered_set<ir::Node *>> pending_ops;
std::unordered_set<ir::Node *> ready_ops;
for (ir::Node *node : graph->Nodes()) {
if (!node->IsOp()) continue;
std::unordered_set<ir::Node *> preceding_ops;
for (auto *in : node->inputs) {
PADDLE_ENFORCE(in->IsVar(),
"Preceding Node of Op Nodes must be Var Node");
if (in->inputs.empty()) continue;
PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp(),
"Preceding Op Node of Var Node must be unique");
preceding_ops.insert(in->inputs[0]);
pending_ops[in->inputs[0]].insert(node);
}
op_deps[node] = preceding_ops.size();
if (preceding_ops.empty()) {
ready_ops.insert(node);
}
}
for (auto *op_desc : ops) {
ir::Node *found_node = nullptr;
for (auto *node : ready_ops) {
if (IsSameOpDesc(op_desc, node->Op())) {
PADDLE_ENFORCE(found_node == nullptr,
"Found multiple op_desc in graph: %s", op_desc->Type());
found_node = node;
}
}
PADDLE_ENFORCE_NOT_NULL(found_node, "Cannot find op_desc in graph: %s",
op_desc->Type());
for (auto *pending_op : pending_ops[found_node]) {
if (--op_deps.at(pending_op) == 0) {
ready_ops.insert(pending_op);
}
}
ready_ops.erase(found_node);
if (skip_dist_ops.count(op_desc->Type()) == 0) {
op_node_list.push_back(found_node);
}
}
for (size_t i = 1; i < op_node_list.size(); ++i) {
auto *dep_var = graph->CreateControlDepVar();
op_node_list[i]->inputs.push_back(dep_var);
op_node_list[i - 1]->outputs.push_back(dep_var);
dep_var->outputs.push_back(op_node_list[i]);
dep_var->inputs.push_back(op_node_list[i - 1]);
VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
<< " and " << op_node_list[i]->Name();
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(sequential_execution_pass,
paddle::framework::details::SequentialExecutionPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
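The pass walks the op descs in program order, checks that each one is currently ready (all of its data dependencies have already been emitted), and then chains the matched non-distributed nodes with control-dependency variables. A small Python sketch of the same idea on a toy dependency table (op names and dependencies are made up for illustration):

# Toy graph: op name -> set of ops it depends on (data dependencies).
preceding = {
    "read": set(),
    "conv": {"read"},
    "relu": {"conv"},
    "send": {"relu"},   # distributed op, skipped when chaining
    "fc":   {"relu"},
}
program_order = ["read", "conv", "relu", "send", "fc"]
skip_dist_ops = {"send", "recv", "send_barrier", "fetch_barrier"}

deps = {op: len(p) for op, p in preceding.items()}
ready = {op for op, n in deps.items() if n == 0}
chained = []

for op in program_order:
    assert op in ready, "op descs must appear in a valid topological order"
    ready.remove(op)
    for other, pre in preceding.items():
        if op in pre:
            deps[other] -= 1
            if deps[other] == 0:
                ready.add(other)
    if op not in skip_dist_ops:
        chained.append(op)

# Each consecutive pair gets a control-dependency edge, like CreateControlDepVar().
ctrl_edges = list(zip(chained, chained[1:]))
print(ctrl_edges)   # [('read', 'conv'), ('conv', 'relu'), ('relu', 'fc')]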

@ -0,0 +1,34 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
class SequentialExecutionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle

@ -41,6 +41,7 @@ pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference)
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base)
pass_library(conv_bias_mkldnn_fuse_pass inference)
pass_library(conv_relu_mkldnn_fuse_pass inference)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
@ -59,6 +60,7 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
endif ()

@ -31,7 +31,8 @@ class ConvReLUFusePass : public FusePassBase {
virtual ~ConvReLUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir

@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
op->SetInput("X", inputs);
}
op->SetOutput("Out", outputs);
op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kForward));
}
// a->OP0->b

@ -0,0 +1,58 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_NODE(id, pattern) \
PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(pattern.RetrieveNode(#id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get());
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
pattern->NewNode("depthwise_conv")
->assert_is_op("depthwise_conv2d")
->assert_op_attr("use_mkldnn", true);
int found_depthwise_conv_mkldnn_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
GET_NODE(depthwise_conv, (*pattern));
depthwise_conv->Op()->SetType("conv2d");
found_depthwise_conv_mkldnn_count++;
};
gpd(graph.get(), handler);
AddStatis(found_depthwise_conv_mkldnn_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(depthwise_conv_mkldnn_pass,
paddle::framework::ir::DepthwiseConvMKLDNNPass);
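The pass is essentially a one-line rewrite: every depthwise_conv2d op that requests MKL-DNN is relabeled as conv2d, presumably so that the MKL-DNN conv2d kernel picks it up. A toy Python sketch of the rewrite rule (the dict representation of an op is purely illustrative):

# Toy op descs: type plus the use_mkldnn attribute.
ops = [
    {"type": "depthwise_conv2d", "use_mkldnn": True},
    {"type": "depthwise_conv2d", "use_mkldnn": False},
    {"type": "conv2d",           "use_mkldnn": True},
]

# Mirror of DepthwiseConvMKLDNNPass: only MKL-DNN depthwise convs are renamed.
for op in ops:
    if op["type"] == "depthwise_conv2d" and op["use_mkldnn"]:
        op["type"] = "conv2d"

print([op["type"] for op in ops])   # ['conv2d', 'depthwise_conv2d', 'conv2d']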

@ -0,0 +1,34 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {
class DepthwiseConvMKLDNNPass : public FusePassBase {
public:
virtual ~DepthwiseConvMKLDNNPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle

@ -0,0 +1,123 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn = false) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("use_mkldnn", use_mkldnn);
op->SetAttr("name", name);
op->SetInput("Input", {inputs[0]});
op->SetInput("Filter", {inputs[1]});
op->SetInput("Bias", {inputs[2]});
op->SetOutput("Out", outputs);
}
// (a, weights, bias)->depthwise conv mkldnn->b
// (b, weights2, bias2)->depthwise conv no mkldnn->c
// (c, weights3, bias3)->conv mkldnn->d
// (d, weights4, bias4)->conv no mkldnn->e
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v : std::vector<std::string>(
{"a", "b", "c", "d", "e", "weights", "bias", "weights2", "bias2",
"weights3", "bias3", "weights4", "bias4"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2" ||
v == "weights3" || v == "bias3" || v == "weights4" || v == "bias4") {
var->SetPersistable(true);
}
}
// depthwise conv with MKL-DNN
SetOp(&prog, "depthwise_conv2d", "conv1",
std::vector<std::string>({"a", "weights", "bias"}),
std::vector<std::string>({"b"}), true);
// depthwise conv without MKL-DNN
SetOp(&prog, "depthwise_conv2d", "conv2",
std::vector<std::string>({"b", "weights2", "bias2"}),
std::vector<std::string>({"c"}), false);
// conv with MKL-DNN
SetOp(&prog, "conv2d", "conv3",
std::vector<std::string>({"c", "weights3", "bias3"}),
std::vector<std::string>({"d"}), true);
// conv without MKL-DNN
SetOp(&prog, "conv2d", "conv4",
std::vector<std::string>({"d", "weights4", "bias4"}),
std::vector<std::string>({"e"}), false);
return prog;
}
TEST(DepthwiseConvMKLDNNPass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass");
struct counters {
int mkldnn_depthwise_conv_nodes;
int other_depthwise_conv_nodes;
int mkldnn_conv_nodes;
int other_conv_nodes;
};
counters before{1, 1, 1, 1};
graph = pass->Apply(std::move(graph));
// initialize counters before loop
counters after{0, 0, 0, 0};
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
if (op->Type() == "conv2d") {
if (boost::get<bool>(op->GetAttr("use_mkldnn")))
after.mkldnn_conv_nodes++;
else
after.other_conv_nodes++;
} else if (op->Type() == "depthwise_conv2d") {
if (boost::get<bool>(op->GetAttr("use_mkldnn")))
after.mkldnn_depthwise_conv_nodes++;
else
after.other_depthwise_conv_nodes++;
}
}
}
EXPECT_EQ(after.other_depthwise_conv_nodes,
before.other_depthwise_conv_nodes);
EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes);
EXPECT_EQ(after.mkldnn_depthwise_conv_nodes,
before.mkldnn_depthwise_conv_nodes - 1);
EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(depthwise_conv_mkldnn_pass);

@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
op->SetInput("X", inputs);
}
op->SetOutput("Out", outputs);
op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kForward));
}
// a->OP0->b

@ -23,8 +23,62 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace ir {
namespace {
void CheckProgram(const ProgramDesc &program) {
std::map<int, bool> visit;
#define _INT(role) static_cast<int>(role)
for (size_t i = 0; i < program.Size(); ++i) {
for (OpDesc *op : program.Block(i).AllOps()) {
// For backward compatibility, some programs don't have the op role attribute set.
if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
int role_id = boost::get<int>(
op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
visit[role_id] = true;
switch (role_id) {
case _INT(OpRole::kForward):
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kBackward)) == visit.end(),
"Cannot add forward operator before backward operator.");
break;
case _INT(OpRole::kBackward):
case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add backward operator before optimize operator.");
break;
case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
_INT(OpRole::kLoss)) == visit.end(),
"Cannot add backward|loss operator before "
"forward|loss operator.");
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add backward operator before optimize operator.");
break;
case _INT(OpRole::kOptimize):
case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
"Optimize operators must follow backward operator.");
break;
case _INT(OpRole::kLRSched):
case _INT(OpRole::kDist):
case _INT(OpRole::kRPC):
case _INT(OpRole::kNotSpecified):
break;
default:
LOG(FATAL) << "Unknown operator role. Don't add new role because "
"you don't know what you are doing.";
}
}
}
#undef _INT
}
} // namespace
Graph::Graph(const ProgramDesc &program) : program_(program) {
CheckProgram(program_);
// Make the nodes id start from 0.
Node::ResetId();
auto var_nodes = InitFromProgram(program_);
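CheckProgram enforces a coarse ordering of op roles within a program: forward ops may not appear after backward ops, backward ops may not appear after optimize ops, and optimize ops must be preceded by at least one backward op. A simplified Python sketch of that check, ignoring the loss and LR-sched role combinations handled above:

def check_roles(roles):
    """roles: op role names in program order, e.g. ['forward', 'backward', 'optimize']."""
    seen = set()
    for role in roles:
        if role == "forward":
            assert "backward" not in seen, "forward op after backward op"
        elif role == "backward":
            assert "optimize" not in seen, "backward op after optimize op"
        elif role == "optimize":
            assert "backward" in seen, "optimize ops must follow a backward op"
        seen.add(role)

check_roles(["forward", "forward", "backward", "optimize"])   # passes
# check_roles(["forward", "optimize"])  # would fail: no backward op seen yet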

@ -79,6 +79,7 @@ class Analyzer : public OrderedRegistry<PassManager> {
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN
"depthwise_conv_mkldnn_pass", //
"conv_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass", //

@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type,
op->SetType(type);
op->SetInput("Xs", inputs);
op->SetOutput("Xs", outputs);
op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(framework::OpRole::kForward));
}
TEST(DataFlowGraph, Build_IR_Graph) {

@ -0,0 +1,132 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using DataLayout = platform::DataLayout;
using ScopedSpatialTransformerDescriptor =
platform::ScopedSpatialTransformerDescriptor;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace");
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output = ctx.Output<Tensor>("Output");
int n = input->dims()[0];
int c = input->dims()[1];
int h = input->dims()[2];
int w = input->dims()[3];
const int size[4] = {n, c, h, w};
const T* input_data = input->data<T>();
const T* grid_data = grid->data<T>();
T* output_data = output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
ScopedSpatialTransformerDescriptor st_desc;
cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, size);
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize2int(input->dims()));
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize2int(output->dims()));
CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerForward(
handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
output_data));
}
};
template <typename T>
class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace");
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto* input = ctx.Input<Tensor>("X");
auto* grid = ctx.Input<Tensor>("Grid");
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
auto output_grad_dims = output_grad->dims();
const int n = output_grad_dims[0];
const int c = output_grad_dims[1];
const int h = output_grad_dims[2];
const int w = output_grad_dims[3];
const int size[4] = {n, c, h, w};
ScopedSpatialTransformerDescriptor st_dest;
cudnnSpatialTransformerDescriptor_t cudnn_st_dest =
st_dest.descriptor<T>(4, size);
const T* input_data = input->data<T>();
const T* grid_data = grid->data<T>();
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data =
input_grad->mutable_data<T>(output_grad_dims, ctx.GetPlace());
T* grid_grad_data =
grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor input_grad_desc;
ScopedTensorDescriptor output_grad_desc;
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize2int(input->dims()));
cudnnTensorDescriptor_t cudnn_input_grad_desc =
input_grad_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize2int(input_grad->dims()));
cudnnTensorDescriptor_t cudnn_output_grad_desc =
output_grad_desc.descriptor<T>(
DataLayout::kNCHW, framework::vectorize2int(output_grad->dims()));
CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerBackward(
handle, cudnn_st_dest, CudnnDataType<T>::kOne(), cudnn_input_desc,
input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
output_grad_data, grid_data, CudnnDataType<T>::kZero(),
grid_grad_data));
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_KERNEL(grid_sampler, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNGridSampleOpKernel<float>,
paddle::operators::CUDNNGridSampleOpKernel<double>);
REGISTER_OP_KERNEL(grid_sampler_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNGridSampleGradOpKernel<float>,
paddle::operators::CUDNNGridSampleGradOpKernel<double>);

@ -0,0 +1,203 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/grid_sampler_op.h"
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class GridSampleOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of GridSampleOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grid"),
"Input(Grid) of GridSampleOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of GridSampleOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto grid_dims = ctx->GetInputDim("Grid");
PADDLE_ENFORCE(x_dims.size() == 4,
"Input(X) of GridSampleOp should be 4-D Tensor.");
PADDLE_ENFORCE(grid_dims.size() == 4,
"Input(Grid) of GridSampleOp should be 4-D Tensor.");
PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
"Input(X) and Input(Grid) dims[0] should be equal.");
PADDLE_ENFORCE_EQ(
grid_dims[1], x_dims[2],
"Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
PADDLE_ENFORCE_EQ(
grid_dims[2], x_dims[3],
"Input(X) dims[3] and Input(Grid) dims[2] should be equal.");
ctx->SetOutputDim("Output", x_dims);
ctx->ShareLoD("X", "Output");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
framework::DataLayout::kAnyLayout, library_);
}
};
class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input data of GridSampleOp, "
"This is a 4-D tensor with shape of [N, C, H, W]");
AddInput(
"Grid",
"(Tensor) The input grid of GridSampleOp generated by AffineGridOp, "
"This is a 4-D tensor with shape of [N, H, W, 2] is the concatenation "
"of x and y coordinates with shape [N, H, W] in last dimention");
AddOutput("Output", "(Tensor) Output tensor with shape [N, C, H, W]");
AddAttr<bool>(
"use_cudnn",
"(bool, default true) Only used in cudnn kernel, need install cudnn")
.SetDefault(true);
AddComment(R"DOC(
This operation samples the input X by using bilinear interpolation based on a
flow field grid, which is usually generated by affine_grid. The grid, of
shape [N, H, W, 2], is the concatenation of (grid_x, grid_y) coordinates
with shape [N, H, W] each, where grid_x indexes the 4th dimension
(the width dimension) of the input data X and grid_y indexes the 3rd
dimension (the height dimension). The final result is the bilinear
interpolation of the 4 nearest corner points.
Step 1:
Get (x, y) grid coordinates and scale them to [0, W-1] and [0, H-1].
grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
Step 2:
Index the input data X with the grid (x, y) in each [H, W] area, and bilinearly
interpolate the point value from the 4 nearest corner points.
wn ------- y_n ------- en
|           |           |
|          d_n          |
|           |           |
x_w --d_w-- grid --d_e-- x_e
|           |           |
|          d_s          |
|           |           |
ws ------- y_s ------- es
x_w = floor(x)              // west side x coord
x_e = x_w + 1               // east side x coord
y_n = floor(y)              // north side y coord
y_s = y_n + 1               // south side y coord
d_w = grid_x - x_w          // distance to west side
d_e = x_e - grid_x          // distance to east side
d_n = grid_y - y_n          // distance to north side
d_s = y_s - grid_y          // distance to south side
wn = X[:, :, y_n, x_w]      // north-west point value
en = X[:, :, y_n, x_e]      // north-east point value
ws = X[:, :, y_s, x_w]      // south-west point value
es = X[:, :, y_s, x_e]      // south-east point value
output = wn * d_e * d_s + en * d_w * d_s
       + ws * d_e * d_n + es * d_w * d_n
)DOC");
}
};
class GridSampleOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto input_dims = ctx->GetInputDim("X");
auto grid_dims = ctx->GetInputDim("Grid");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
}
if (ctx->HasOutput(framework::GradVarName("Grid"))) {
ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
framework::DataLayout::kAnyLayout, library_);
}
};
class GridSampleGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("grid_sampler_grad");
op->SetInput("X", Input("X"));
op->SetInput("Grid", Input("Grid"));
op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("Grid"), InputGrad("Grid"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
ops::GridSampleGradMaker);
REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
REGISTER_OP_CPU_KERNEL(
grid_sampler,
ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
grid_sampler_grad,
ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
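The DOC block describes the sampling in two steps: rescale the normalized grid to pixel coordinates, then bilinearly interpolate between the four neighbouring pixels. A small NumPy sketch of that procedure for a single channel, with only minimal border handling (the real kernels handle layout, batching, and clamping; the helper name is made up):

import numpy as np

def grid_sample_2d(x, grid):
    """x: [H, W] input; grid: [H_out, W_out, 2] with values in [-1, 1]."""
    H, W = x.shape
    # Step 1: scale normalized coordinates to pixel coordinates.
    gx = 0.5 * (grid[..., 0] + 1.0) * (W - 1)
    gy = 0.5 * (grid[..., 1] + 1.0) * (H - 1)
    # Step 2: bilinear interpolation from the 4 nearest corner points.
    x_w = np.floor(gx).astype(int)
    y_n = np.floor(gy).astype(int)
    x_e = np.clip(x_w + 1, 0, W - 1)
    y_s = np.clip(y_n + 1, 0, H - 1)
    d_w, d_n = gx - x_w, gy - y_n
    d_e, d_s = 1.0 - d_w, 1.0 - d_n
    wn, en = x[y_n, x_w], x[y_n, x_e]
    ws, es = x[y_s, x_w], x[y_s, x_e]
    return wn * d_e * d_s + en * d_w * d_s + ws * d_e * d_n + es * d_w * d_n

x = np.arange(16, dtype=np.float32).reshape(4, 4)
# Identity grid: sampling positions coincide with the input pixels.
identity = np.stack(np.meshgrid(np.linspace(-1, 1, 4), np.linspace(-1, 1, 4)), axis=-1)
print(np.allclose(grid_sample_2d(x, identity), x))   # True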

File diff suppressed because it is too large.

@ -31,7 +31,7 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* output) {
bool exclusive, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
@ -68,7 +68,8 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
pool_process.compute(input_data[h * input_width + w], &ele);
}
}
int pool_size = (hend - hstart) * (wend - wstart);
int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
: ksize_height * ksize_width;
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[ph * output_width + pw] = ele;
}
@ -93,7 +94,7 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_grad_process,
framework::Tensor* input_grad) {
bool exclusive, framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
@ -124,7 +125,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart);
int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
: ksize_height * ksize_width;
float scale = 1.0 / pool_size;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
@ -249,7 +251,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* output) {
bool exclusive, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
@ -300,7 +302,9 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
}
}
int pool_size =
(dend - dstart) * (hend - hstart) * (wend - wstart);
exclusive
? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[output_idx] = ele;
}
@ -326,7 +330,7 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const framework::Tensor& output, const framework::Tensor& output_grad,
const std::vector<int>& ksize, const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_grad_process,
framework::Tensor* input_grad) {
bool exclusive, framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
@ -369,7 +373,9 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
wstart = std::max(wstart, 0);
int pool_size =
(dend - dstart) * (hend - hstart) * (wend - wstart);
exclusive
? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
float scale = 1.0 / pool_size;
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {

@ -29,7 +29,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
const int ksize_width, const int stride_height,
const int stride_width, const int padding_height,
const int padding_width, PoolProcess pool_process,
T* output_data) {
bool exclusive, T* output_data) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int pw = index % output_width;
@ -52,7 +52,8 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
pool_process.compute(input_data[h * input_width + w], &ele);
}
}
int pool_size = (hend - hstart) * (wend - wstart);
int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
: ksize_height * ksize_width;
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[index] = ele;
}
@ -65,7 +66,7 @@ __global__ void KernelPool2DGrad(
const int input_width, const int output_height, const int output_width,
const int ksize_height, const int ksize_width, const int stride_height,
const int stride_width, const int padding_height, const int padding_width,
PoolProcess pool_process, T* input_grad) {
PoolProcess pool_process, bool exclusive, T* input_grad) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int offsetW = index % input_width + padding_width;
@ -95,7 +96,8 @@ __global__ void KernelPool2DGrad(
int wend = min(wstart + ksize_width, input_width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart);
int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
: ksize_height * ksize_width;
int output_sub_idx = ph * output_width + pw;
pool_process.compute(input, output_data[output_sub_idx],
output_grad[output_sub_idx],
@ -163,7 +165,7 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* output) {
bool exclusive, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
@ -189,7 +191,8 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
nthreads, input_data, input_channels, input_height, input_width,
output_height, output_width, ksize_height, ksize_width, stride_height,
stride_width, padding_height, padding_width, pool_process, output_data);
stride_width, padding_height, padding_width, pool_process, exclusive,
output_data);
}
};
@ -208,7 +211,7 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* input_grad) {
bool exclusive, framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
@ -236,7 +239,7 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
nthreads, input_data, output_data, output_grad_data, input_channels,
input_height, input_width, output_height, output_width, ksize_height,
ksize_width, stride_height, stride_width, padding_height, padding_width,
pool_process, input_grad_data);
pool_process, exclusive, input_grad_data);
}
};
@ -313,16 +316,14 @@ template class Pool2dGradFunctor<platform::CUDADeviceContext,
double>;
template <typename PoolProcess, typename T>
__global__ void KernelPool3D(const int nthreads, const T* input_data,
const int channels, const int input_depth,
const int input_height, const int input_width,
const int output_depth, const int output_height,
const int output_width, const int ksize_depth,
const int ksize_height, const int ksize_width,
const int stride_depth, const int stride_height,
const int stride_width, const int padding_depth,
const int padding_height, const int padding_width,
PoolProcess pool_process, T* output_data) {
__global__ void KernelPool3D(
const int nthreads, const T* input_data, const int channels,
const int input_depth, const int input_height, const int input_width,
const int output_depth, const int output_height, const int output_width,
const int ksize_depth, const int ksize_height, const int ksize_width,
const int stride_depth, const int stride_height, const int stride_width,
const int padding_depth, const int padding_height, const int padding_width,
PoolProcess pool_process, bool exclusive, T* output_data) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int pw = index % output_width;
@ -351,7 +352,9 @@ __global__ void KernelPool3D(const int nthreads, const T* input_data,
}
}
}
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
int pool_size = exclusive
? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[index] = ele;
}
@ -366,7 +369,7 @@ __global__ void KernelPool3DGrad(
const int ksize_height, const int ksize_width, const int stride_depth,
const int stride_height, const int stride_width, const int padding_depth,
const int padding_height, const int padding_width, PoolProcess pool_process,
T* input_grad) {
bool exclusive, T* input_grad) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int offsetW = index % input_width + padding_width;
@ -409,7 +412,9 @@ __global__ void KernelPool3DGrad(
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
int pool_size =
exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart)
: ksize_depth * ksize_height * ksize_width;
int output_sub_idx = (pd * output_height + ph) * output_width + pw;
pool_process.compute(input, output_data[output_sub_idx],
output_grad[output_sub_idx],
@ -484,7 +489,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const framework::Tensor& input, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* output) {
bool exclusive, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_depth = input.dims()[2];
@ -517,7 +522,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
nthreads, input_data, input_channels, input_depth, input_height,
input_width, output_depth, output_height, output_width, ksize_depth,
ksize_height, ksize_width, stride_depth, stride_height, stride_width,
padding_depth, padding_height, padding_width, pool_process,
padding_depth, padding_height, padding_width, pool_process, exclusive,
output_data);
}
};
@ -537,7 +542,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_process,
framework::Tensor* input_grad) {
bool exclusive, framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_depth = input.dims()[2];
@ -573,7 +578,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
input_depth, input_height, input_width, output_depth, output_height,
output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
stride_height, stride_width, padding_depth, padding_height,
padding_width, pool_process, input_grad_data);
padding_width, pool_process, exclusive, input_grad_data);
}
};

@ -89,7 +89,7 @@ class Pool2dFunctor {
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
framework::Tensor* output);
bool exclusive, framework::Tensor* output);
};
template <typename DeviceContext, typename PoolProcess, typename T>
@ -101,7 +101,7 @@ class Pool2dGradFunctor {
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
framework::Tensor* input_grad);
bool exclusive, framework::Tensor* input_grad);
};
template <typename DeviceContext, class T>
@ -123,7 +123,7 @@ class Pool3dFunctor {
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
framework::Tensor* output);
bool exclusive, framework::Tensor* output);
};
template <typename DeviceContext, typename PoolProcess, typename T>
@ -135,7 +135,7 @@ class Pool3dGradFunctor {
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, PoolProcess pool_compute,
framework::Tensor* input_grad);
bool exclusive, framework::Tensor* input_grad);
};
template <typename DeviceContext, class T>

@ -41,6 +41,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
T *output_data = output->mutable_data<T>(ctx.GetPlace());
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
bool exclusive = ctx.Attr<bool>("exclusive");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@ -72,7 +73,8 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
if (pooling_type == "max") {
pooling_mode = PoolingMode::kMaximum;
} else {
pooling_mode = PoolingMode::kAverage;
pooling_mode = exclusive ? PoolingMode::kAverageExclusive
: PoolingMode::kAverageInclusive;
}
cudnnPoolingDescriptor_t cudnn_pool_desc =
@ -101,6 +103,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
Tensor *input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
bool exclusive = ctx.Attr<bool>("exclusive");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@ -141,7 +144,8 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
pooling_mode = PoolingMode::kMaximum;
}
} else {
pooling_mode = PoolingMode::kAverage;
pooling_mode = exclusive ? PoolingMode::kAverageExclusive
: PoolingMode::kAverageInclusive;
}
cudnnPoolingDescriptor_t cudnn_pool_desc =

@ -180,6 +180,12 @@ void Pool2dOpMaker::Make() {
"operator."
"If global_pooling = true, paddings and ksize will be ignored.")
.SetDefault({0, 0});
AddAttr<bool>(
"exclusive",
"(bool, default True) When true, will exclude the zero-padding in the "
"averaging calculating, otherwise, include the zero-padding. Note, it "
"is only used when pooling_type is avg. The defalut is True.")
.SetDefault(true);
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
@ -236,6 +242,23 @@ Example:
W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
$$
For exclusive = false:
$$
hstart = i * strides[0] - paddings[0]
hend = hstart + ksize[0]
wstart = j * strides[1] - paddings[1]
wend = wstart + ksize[1]
Output(i, j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
$$
For exclusive = true:
$$
hstart = max(0, i * strides[0] - paddings[0])
hend = min(H, hstart + ksize[0])
wstart = max(0, j * strides[1] - paddings[1])
wend = min(W, wstart + ksize[1])
Output(i, j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
)DOC");
}
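A quick worked example of the difference: for an average-pooling window of kernel size 3 at a border where only the values [1, 2] lie inside the input and one position is zero-padding, exclusive=True divides by the 2 real elements (giving 1.5), while exclusive=False divides by the full kernel size 3 (giving 1.0). A tiny Python sketch (the helper is hypothetical, not Paddle API):

def avg_pool_window(values_inside, ksize, exclusive=True):
    # values_inside: the input values that actually fall inside the (clamped) window.
    divisor = len(values_inside) if exclusive else ksize
    return sum(values_inside) / divisor

print(avg_pool_window([1.0, 2.0], ksize=3, exclusive=True))    # 1.5
print(avg_pool_window([1.0, 2.0], ksize=3, exclusive=False))   # 1.0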
@ -283,6 +306,12 @@ void Pool3dOpMaker::Make() {
"If global_pooling = true, ksize and paddings will be ignored.")
.SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.)
AddAttr<bool>(
"exclusive",
"(bool, default True) When true, will exclude the zero-padding in the "
"averaging calculating, otherwise, include the zero-padding. Note, it "
"is only used when pooling_type is avg. The defalut is True.")
.SetDefault(true);
AddAttr<bool>(
"use_cudnn",

@ -69,6 +69,7 @@ class PoolKernel : public framework::OpKernel<T> {
std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
bool exclusive = context.Attr<bool>("exclusive");
if (context.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
@ -84,7 +85,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool2d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
out);
true, out);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool2dFunctor<
@ -92,7 +93,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
out);
exclusive, out);
}
} break;
case 3: {
@ -102,14 +103,14 @@ class PoolKernel : public framework::OpKernel<T> {
pool3d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
out);
true, out);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool3dFunctor<
DeviceContext, paddle::operators::math::AvgPool<T>, T>
pool3d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
out);
exclusive, out);
}
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
@ -131,6 +132,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
bool exclusive = context.Attr<bool>("exclusive");
if (context.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
@ -157,7 +159,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
pool2d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process;
pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
paddings, pool_process, in_x_grad);
paddings, pool_process, exclusive, in_x_grad);
}
} break;
case 3: {
@ -172,7 +174,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
pool3d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process;
pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
paddings, pool_process, in_x_grad);
paddings, pool_process, exclusive, in_x_grad);
}
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }

Some files were not shown because too many files have changed in this diff.
