Add bn and relu fuse pass (#22048)
* add bn and relu fuse pass
* add op attr assert and dtype assert
* fix some inputs && outputs bugs for the fused op and pattern
* add the unittest for fuse_bn_act_pass. test=develop
* use normative enforce statements. test=develop
* add the cpu test. test=develop
* add the support of batch_size=1 for the bn with relu op. test=develop
* add the error type for paddle throws. test=develop
* add fused_batch_norm_act and fused_batch_norm_act_grad to op_has_unsed_vars_white_list. test=develop
parent 0d82baf837
commit 46189b166d
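For context, the new pass is toggled through BuildStrategy, as the unit test at the end of this diff exercises. A minimal usage sketch (main_program and loss are assumed to come from an already-built network, as in the test):

import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
# fuse batch_norm followed by relu into the fused_batch_norm_act op
build_strategy.fuse_bn_act_ops = True
# main_program and loss come from an already-built network (see the test below):
# binary = fluid.CompiledProgram(main_program).with_data_parallel(
#     loss_name=loss.name, build_strategy=build_strategy)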
File diff suppressed because it is too large
@@ -0,0 +1,64 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

/*
 * Fuse the BatchNorm and activation.
 */
class FuseBatchNormActPass : public FusePassBase {
 public:
  virtual ~FuseBatchNormActPass() {}

 protected:
  void ApplyImpl(ir::Graph *graph) const override;

  ir::Graph *FuseBatchNormAct(
      ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;

  ir::Graph *FuseBatchNormActGrad(
      ir::Graph *graph,
      const std::unordered_set<std::string> &act_grad_types) const;

  std::vector<Node *> ReplaceNode(Node *cur_node, Node *new_node,
                                  const std::vector<Node *> &nodes) const;

  void ReLinkNodes(Graph *graph, const Node *intermediate_out, Node *op_1,
                   Node *op_2, Node *fused_op) const;

  Node *CreateFusedBatchNormActNode(
      Graph *g, const Node *act, const Node *bn, const std::string &bn_x_n,
      const std::string &bn_scale_n, const std::string &bn_bias_n,
      const std::string &bn_variance_n, const std::string &bn_mean_n,
      const std::string &bn_mean_out_n, const std::string &bn_variance_out_n,
      const std::string &bn_saved_variance_n,
      const std::string &bn_saved_mean_n, const std::string &bn_reserve_space_n,
      const std::string &act_out_n) const;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
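Numerically, the fusion this pass performs is equivalent to relu(batch_norm(x)). A hedged NumPy sketch of the training-mode forward over NHWC input (an illustrative reference model, not the actual CUDA kernel):

import numpy as np

def fused_bn_relu_reference(x, scale, bias, eps=1e-5):
    # x is NHWC; statistics are computed per channel, as in training mode
    mean = x.mean(axis=(0, 1, 2))
    var = x.var(axis=(0, 1, 2))
    y = (x - mean) / np.sqrt(var + eps) * scale + bias  # batch_norm
    return np.maximum(y, 0.0)                           # relu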
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,109 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace operators {
using Tensor = framework::Tensor;

class FusedBatchNormActOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;

  framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name, const Tensor& tensor,
      const framework::OpKernelType& expected_kernel_type) const override;
};

class FusedBatchNormActGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class FusedBatchNormActOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};

template <typename T>
class FusedBatchNormActGradOpMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  std::unique_ptr<T> Apply() const override {
    std::unique_ptr<T> op(new T());
    op->SetType(this->ForwardOpType() + "_grad");
    op->SetInput("X", this->Input("X"));
    op->SetInput("Y", this->Output("Y"));
    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));

    op->SetInput("Scale", this->Input("Scale"));
    op->SetInput("Bias", this->Input("Bias"));
    op->SetInput("SavedMean", this->Output("SavedMean"));
    op->SetInput("SavedVariance", this->Output("SavedVariance"));
    op->SetInput("ReserveSpace", this->Output("ReserveSpace"));

    op->SetAttrMap(this->Attrs());

    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
    op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
    op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));

    return op;
  }
};

class FusedBatchNormActOpInferVarType
    : public framework::PassInDtypeAndVarTypeToOutput {
 protected:
  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
      const override {
    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
  }
};

template <typename DeviceContext, typename T>
class FusedBatchNormActKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};

template <typename DeviceContext, typename T>
class FusedBatchNormActGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};

}  // namespace operators
}  // namespace paddle
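The grad op maker above feeds the forward's Y, SavedMean, SavedVariance, and ReserveSpace into the backward op, because the backward composes relu_grad with batch_norm_grad. A hedged NumPy sketch of that composition (an illustrative reference assuming saved_inv_std holds the saved inverse standard deviation; not the cuDNN implementation):

import numpy as np

def fused_bn_relu_grad_reference(dy, y, x, scale, saved_mean, saved_inv_std):
    dy = dy * (y > 0)  # relu backward: zero the gradient where the output was clipped
    n = x.shape[0] * x.shape[1] * x.shape[2]  # elements per channel (NHWC)
    x_hat = (x - saved_mean) * saved_inv_std
    d_scale = (dy * x_hat).sum(axis=(0, 1, 2))
    d_bias = dy.sum(axis=(0, 1, 2))
    # standard training-mode batch_norm backward w.r.t. the input
    dx = (scale * saved_inv_std / n) * (n * dy - d_bias - x_hat * d_scale)
    return dx, d_scale, d_bias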
@@ -0,0 +1,121 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.fluid as fluid
import unittest


class TestFuseBatchNormActPass(unittest.TestCase):
    def build_program(self, main_program, startup_program, use_cuda, seed=1):
        main_program.random_seed = seed
        startup_program.random_seed = seed
        with fluid.program_guard(main_program, startup_program):
            x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
            y = fluid.layers.data(name="y", shape=[1], dtype='int64')
            hidden1 = fluid.layers.conv2d(
                input=x,
                filter_size=3,
                num_filters=32,
                stride=1,
                padding=1,
                act=None,
                bias_attr=False,
                data_format='NHWC')
            param_attr = fluid.ParamAttr(
                name='batch_norm_w',
                initializer=fluid.initializer.Constant(value=1.0))
            bias_attr = fluid.ParamAttr(
                name='batch_norm_b',
                initializer=fluid.initializer.Constant(value=0.0))
            hidden2 = fluid.layers.batch_norm(
                input=hidden1,
                param_attr=param_attr,
                bias_attr=bias_attr,
                act='relu',
                data_layout='NHWC')
            hidden3 = fluid.layers.fc(input=hidden2, size=128, act='relu')
            hidden4 = fluid.layers.batch_norm(
                input=hidden3, act='relu', data_layout='NHWC')
            prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax')
            loss = fluid.layers.cross_entropy(input=prediction, label=y)
            loss = fluid.layers.mean(loss)
            sgd = fluid.optimizer.SGD(learning_rate=0.001)
            if use_cuda:
                sgd = fluid.contrib.mixed_precision.decorate(
                    sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
            sgd.minimize(loss)
        return x, y, loss

    def check(self, place, use_cuda):
        main_program = fluid.Program()
        startup_program = fluid.Program()
        x, y, loss = self.build_program(main_program, startup_program, use_cuda)
        exe = fluid.Executor(place)
        iters = 10
        batch_size = 16
        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)

        # close fused_bn_act_ops
        build_strategy = fluid.BuildStrategy()
        build_strategy.fuse_bn_act_ops = False
        binary = fluid.CompiledProgram(main_program).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        train_reader = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=batch_size)
        loss_vals = []
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup_program)
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                loss_vals.append(loss_v[0][0])

        # open fused_bn_act_ops
        build_strategy_fused = fluid.BuildStrategy()
        build_strategy_fused.fuse_bn_act_ops = True
        binary_fused = fluid.CompiledProgram(main_program).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy_fused)
        train_reader_fused = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=batch_size)
        loss_vals_fused = []
        scope_fused = fluid.Scope()
        with fluid.scope_guard(scope_fused):
            exe.run(startup_program)
            for _ in range(iters):
                data = next(train_reader_fused())
                loss_v = exe.run(binary_fused,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                loss_vals_fused.append(loss_v[0][0])

        # check loss
        for i in range(iters):
            self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5)

    def test_fuse_bn_act_pass_cpu(self):
        place = fluid.CPUPlace()
        self.check(place, use_cuda=False)

    def test_fuse_bn_act_pass_cuda(self):
        if fluid.core.is_compiled_with_cuda():
            place = fluid.CUDAPlace(0)
            self.check(place, use_cuda=True)


if __name__ == '__main__':
    unittest.main()