parent 821534efd3
commit 21d95be0db
@@ -0,0 +1,208 @@ paddle/fluid/operators/inplace_abn_op.cc
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/inplace_abn_op.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/operators/batch_norm_op.h"

namespace paddle {
namespace operators {

class InplaceABNOp : public paddle::operators::BatchNormOp {
 public:
  using paddle::operators::BatchNormOp::BatchNormOp;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
    // By default, the data type of the Scale, Bias, Mean and Variance
    // tensors should be float (for float or float16 input tensors) or
    // double (for double input tensors).
    auto bn_param_type = framework::proto::VarType::FP32;
    if (input_data_type == framework::proto::VarType::FP64) {
      bn_param_type = framework::proto::VarType::FP64;
    }
    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Scale")->type(),
                      platform::errors::InvalidArgument(
                          "Scale input should be of float type"));
    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Bias")->type(),
                      platform::errors::InvalidArgument(
                          "Bias input should be of float type"));
    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Mean")->type(),
                      platform::errors::InvalidArgument(
                          "Mean input should be of float type"));
    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("Variance")->type(),
                      platform::errors::InvalidArgument(
                          "Variance input should be of float type"));

    framework::LibraryType library = framework::LibraryType::kPlain;
    framework::DataLayout layout = framework::DataLayout::kAnyLayout;

    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                   library);
  }
};

class InplaceABNGradOp : public paddle::operators::BatchNormGradOp {
 public:
  using paddle::operators::BatchNormGradOp::BatchNormGradOp;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    const auto* var = ctx.InputVar(framework::GradVarName("Y"));
    auto input_data_type = ctx.Input<Tensor>("Y")->type();
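    // X is not an input of the grad op (it may have been overwritten
    // in-place), so the kernel data type is deduced from Y instead.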
    if (var == nullptr) {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "can't find gradient variable of Y"));
    }
    const Tensor* t = nullptr;
    if (var->IsType<Tensor>()) {
      t = &var->Get<Tensor>();
    } else if (var->IsType<LoDTensor>()) {
      t = &var->Get<LoDTensor>();
    }
    if (t == nullptr) {
      PADDLE_THROW(
          platform::errors::InvalidArgument("gradient variable of Y is empty"));
    }
    framework::LibraryType library = framework::LibraryType::kPlain;
    framework::DataLayout layout = framework::DataLayout::kAnyLayout;

    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                   library);
  }
};

class InplaceABNOpMaker : public paddle::operators::BatchNormOpMaker {
 public:
  void Make() override {
    BatchNormOpMaker::Make();
    AddAttr<std::string>(
        "activation",
        "(enum string, default identity, can be identity|elu|leaky_relu) "
        "The activation type to be fused with batch_norm.")
        .SetDefault("");
    AddAttr<float>("alpha",
                   "(float, default 0.1) Only used in the inplace_abn kernel. "
                   "The activation (identity|elu|leaky_relu) is fused with "
                   "batch_norm; this is the alpha value for elu|leaky_relu.")
        .SetDefault(0.1f);
    AddAttr<bool>("use_sync_bn",
                  "(bool, default false) Whether to use synchronized batch "
                  "normalization.")
        .SetDefault(false);
  }
};

template <typename T>
class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  void Apply(GradOpPtr<T> op) const override {
    op->SetType(this->ForwardOpType() + "_grad");
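    // Unlike batch_norm_grad, this grad op takes Y (and Y@GRAD) rather than
    // X: in in-place mode X has been overwritten by Y, and the grad kernel
    // recovers the pre-activation values from Y before running the regular
    // batch-norm backward.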
    op->SetInput("Y", this->Output("Y"));
    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));

    op->SetInput("Scale", this->Input("Scale"));
    op->SetInput("Bias", this->Input("Bias"));
    op->SetInput("SavedMean", this->Output("SavedMean"));
    op->SetInput("SavedVariance", this->Output("SavedVariance"));

    // Used when use_global_stats is set to true during training.
    if (boost::get<bool>(this->GetAttr("use_global_stats"))) {
      op->SetInput("Mean", this->Output("MeanOut"));
      op->SetInput("Variance", this->Output("VarianceOut"));
    }

    op->SetAttrMap(this->Attrs());

    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
    op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
    op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
  }
};

template <typename DeviceContext, typename T>
class InplaceABNKernel
    : public paddle::operators::BatchNormKernel<DeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<Tensor>("X");
    auto* y = ctx.Output<Tensor>("Y");
    PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument(
                                "X and Y not inplaced in inplace mode"));
    auto activation =
        GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    BatchNormKernel<DeviceContext, T>::Compute(ctx);
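    // Batch norm has written its output into y, which shares the buffer of
    // x; now apply the fused activation in place on that same buffer.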
    auto cur_y = EigenVector<T>::Flatten(*y);
    InplaceABNActivation<DeviceContext, T> functor;
    functor.Compute(ctx, activation, place, cur_y, cur_y);
  }
};

template <typename DeviceContext, typename T>
class InplaceABNGradKernel
    : public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* y = ctx.Input<Tensor>("Y");
    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    PADDLE_ENFORCE_EQ(d_x, d_y,
                      platform::errors::InvalidArgument(
                          "X@GRAD and Y@GRAD not inplaced in inplace mode"));
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    auto activation =
        GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));

    auto py = *y;
    auto pd_y = *d_y;
    auto cur_y = EigenVector<T>::Flatten(py);
    auto cur_dy = EigenVector<T>::Flatten(pd_y);
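    // Undo the activation first: GradCompute overwrites y with the recovered
    // pre-activation values and dy with the activation gradient, so the
    // batch-norm backward below sees the tensors it expects.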
    InplaceABNActivation<DeviceContext, T> functor;
    functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy);

    BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(inplace_abn, ops::InplaceABNOp, ops::InplaceABNOpMaker,
                  ops::BatchNormOpInferVarType,
                  ops::InplaceABNOpGradMaker<paddle::framework::OpDesc>,
                  ops::InplaceABNOpGradMaker<paddle::imperative::OpBase>)
REGISTER_OPERATOR(inplace_abn_grad, ops::InplaceABNGradOp)

REGISTER_OP_CPU_KERNEL(
    inplace_abn,
    ops::InplaceABNKernel<paddle::platform::CPUDeviceContext, float>,
    ops::InplaceABNKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    inplace_abn_grad,
    ops::InplaceABNGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::InplaceABNGradKernel<paddle::platform::CPUDeviceContext, double>);
@@ -0,0 +1,92 @@ paddle/fluid/operators/inplace_abn_op.cu
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/inplace_abn_op.h"
#include "paddle/fluid/operators/sync_batch_norm_op.cu.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class InplaceABNKernel
    : public paddle::operators::SyncBatchNormKernel<DeviceContext, T>,
      public paddle::operators::BatchNormKernel<DeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* y = ctx.Output<Tensor>("Y");
    auto* x = ctx.Input<Tensor>("X");
    PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument(
                                "X and Y not inplaced in inplace mode"));
    auto activation =
        GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();

    if (ctx.Attr<bool>("use_sync_bn")) {
      SyncBatchNormKernel<DeviceContext, T>::Compute(ctx);
    } else {
      BatchNormKernel<DeviceContext, T>::Compute(ctx);
    }

    auto cur_y = EigenVector<T>::Flatten(*y);
    InplaceABNActivation<DeviceContext, T> functor;
    functor.Compute(ctx, activation, place, cur_y, cur_y);
  }
};

// Deriving the Gradient for the Backward Pass of Batch Normalization
// https://kevinzakka.github.io/2016/09/14/batch_normalization/
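// For reference, with x_hat = (x - mu) / sqrt(var + eps) and a mini-batch of
// size m, the batch-norm input gradient is
//   dx_i = gamma / (m * sqrt(var + eps)) *
//          (m * dy_i - sum_j(dy_j) - x_hat_i * sum_j(dy_j * x_hat_j))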
template <typename DeviceContext, typename T>
class InplaceABNGradKernel
    : public paddle::operators::SyncBatchNormGradKernel<DeviceContext, T>,
      public paddle::operators::BatchNormGradKernel<DeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* y = ctx.Input<Tensor>("Y");
    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    PADDLE_ENFORCE_EQ(d_x, d_y,
                      platform::errors::InvalidArgument(
                          "X@GRAD and Y@GRAD not inplaced in inplace mode"));
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    auto activation =
        GetInplaceABNActivationType(ctx.Attr<std::string>("activation"));

    auto py = *y;
    auto pd_y = *d_y;
    auto cur_y = EigenVector<T>::Flatten(py);
    auto cur_dy = EigenVector<T>::Flatten(pd_y);

    InplaceABNActivation<DeviceContext, T> functor;
    functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy);

    if (ctx.Attr<bool>("use_sync_bn")) {
      SyncBatchNormGradKernel<DeviceContext, T>::Compute(ctx);
    } else {
      BatchNormGradKernel<DeviceContext, T>::Compute(ctx);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(inplace_abn,
                        ops::InplaceABNKernel<plat::CUDADeviceContext, float>,
                        ops::InplaceABNKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    inplace_abn_grad, ops::InplaceABNGradKernel<plat::CUDADeviceContext, float>,
    ops::InplaceABNGradKernel<plat::CUDADeviceContext, double>);
@@ -0,0 +1,117 @@ paddle/fluid/operators/inplace_abn_op.h
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/math_function.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

enum InplaceABNActivationType { identity = 0, leakyrelu = 1, elu = 2 };

inline InplaceABNActivationType GetInplaceABNActivationType(
    const std::string& type) {
  if (type == "leaky_relu") {
    return InplaceABNActivationType::leakyrelu;
  } else if (type == "elu") {
    return InplaceABNActivationType::elu;
  } else if (type == "identity" || type == "") {
    return InplaceABNActivationType::identity;
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "unsupported activation type %s for Op(inplace_abn)", type));
  }
}

template <typename DeviceContext, typename T>
class InplaceABNActivation {
 private:
  template <typename Functor>
  void setAttrs(const framework::ExecutionContext& ctx, Functor* functor) {
    auto attrs = functor->GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = ctx.Attr<float>(attr.first);
    }
  }

  template <typename Functor, typename... Args>
  void compute(const framework::ExecutionContext& ctx, Functor* functor,
               Args... args) {
    setAttrs(ctx, functor);
    (*functor)(args...);
  }

 public:
  template <typename Device, typename X, typename Y>
  void Compute(const framework::ExecutionContext& ctx, const int act_type,
               const Device& d, X x, Y y) {
    if (act_type == InplaceABNActivationType::identity) {
      y.device(d) = x;
    } else if (act_type == InplaceABNActivationType::leakyrelu) {
      LeakyReluFunctor<T> functor;
      compute(ctx, &functor, d, x, y);
    } else if (act_type == InplaceABNActivationType::elu) {
      ELUFunctor<T> functor;
      compute(ctx, &functor, d, x, y);
    } else {
      PADDLE_THROW(
          platform::errors::InvalidArgument("unsupported activation type"));
    }
  }

  template <typename Device, typename X, typename Y, typename DX, typename DY>
  void GradCompute(const framework::ExecutionContext& ctx, const int act_type,
                   const Device& d, X x, Y y, DX dx, DY dy) {
    const float alpha = ctx.Attr<float>("alpha");
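    // The forward pass stored only the activated output, so before applying
    // the activation's gradient functor we first recover the pre-activation
    // value x from y by inverting the activation (y / alpha for negative
    // leaky_relu outputs, log(y / alpha + 1) for negative elu outputs).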
    if (act_type == InplaceABNActivationType::identity) {
      x.device(d) = y;
      dx.device(d) = dy;
    } else if (act_type == InplaceABNActivationType::leakyrelu) {
      auto temp1 = (y < static_cast<T>(0)).template cast<T>().eval() /
                   static_cast<T>(alpha);
      auto temp2 = (y >= static_cast<T>(0)).template cast<T>().eval();
      x.device(d) = y * (temp1 + temp2).template cast<T>();

      LeakyReluGradFunctor<T> functor;
      compute(ctx, &functor, d, x, y, dy, dx);
    } else if (act_type == InplaceABNActivationType::elu) {
      auto temp1 = (y >= static_cast<T>(0)).template cast<T>().eval();
      auto temp = (y < static_cast<T>(0)).template cast<T>().eval();
      auto temp2 = (y * temp / static_cast<T>(alpha) + static_cast<T>(1)).log();
      x.device(d) = (y * temp1 + temp2).template cast<T>();

      ELUGradFunctor<T> functor;
      compute(ctx, &functor, d, x, y, dy, dx);
    } else {
      PADDLE_THROW(
          platform::errors::InvalidArgument("unsupported activation type"));
    }
  }
};

}  // namespace operators
}  // namespace paddle
@@ -0,0 +1,189 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
import os
import six
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import compiler
import paddle.fluid.unique_name as unique_name


class TestInplaceABNOpTraining(unittest.TestCase):
    def setUp(self):
        self.dtype = np.float64
        self.N = 4
        self.C = 5
        self.H = 7
        self.W = 9
        self.dshape = [self.N, self.C, self.H, self.W]

    def build_program(self,
                      place,
                      layout,
                      seed,
                      only_forward=False,
                      activation="identity",
                      alpha=1.0,
                      use_cuda=False,
                      inplace=False):
        main = fluid.Program()
        startup = fluid.Program()
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                data = fluid.layers.data(
                    name='input',
                    shape=self.dshape,
                    dtype=self.dtype,
                    append_batch_size=False,
                    stop_gradient=False)
                if inplace:
                    bn = fluid.layers.inplace_abn(
                        data,
                        act=activation,
                        param_attr=fluid.ParamAttr(name='bn_scale'),
                        bias_attr=fluid.ParamAttr(name='bn_bias'),
                        moving_mean_name='bn_moving_mean',
                        moving_variance_name='bn_moving_variance',
                        data_layout=layout,
                        is_test=only_forward,
                        act_alpha=alpha)
                else:
                    bn = fluid.layers.batch_norm(
                        data,
                        param_attr=fluid.ParamAttr(name='bn_scale'),
                        bias_attr=fluid.ParamAttr(name='bn_bias'),
                        moving_mean_name='bn_moving_mean',
                        moving_variance_name='bn_moving_variance',
                        data_layout=layout,
                        is_test=only_forward,
                        in_place=inplace)
                    if activation == 'leaky_relu':
                        bn = fluid.layers.leaky_relu(bn, alpha)
                    if activation == 'elu':
                        bn = fluid.layers.elu(bn, alpha)

                # NOTE: in in-place mode the input and output of bn may share
                # the same name, so multiply by 1. to create a new Variable
                # for fetching.
                bn = bn * 1.

                sigmoid = fluid.layers.sigmoid(bn)
                out = fluid.layers.reduce_sum(sigmoid)
                if not only_forward:
                    sgd_opt = fluid.optimizer.SGD(learning_rate=0.0)
                    sgd_opt.backward(out)
        return main, startup, [out, bn]

    def compare(self, place, layout, only_forward, activation, alpha, use_cuda):
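        # Build the same network twice (separate batch_norm + activation ops
        # vs. the fused inplace_abn op), run both on identical data, and
        # check that all fetched outputs, statistics and gradients match
        # within tolerance.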
        seed = 10
        os.environ['FLAGS_cudnn_deterministic'] = "1"
        data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2

        fetch_outs = []
        fetch_names = []
        for inplace in [False, True]:
            main, startup, outs = self.build_program(
                place,
                layout,
                seed,
                only_forward,
                activation,
                alpha,
                inplace=inplace)
            exe = fluid.Executor(place)
            exe.run(startup)

            fetch_name = [v.name for v in outs] + [
                'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
            ]
            if not only_forward:
                others = [
                    'inplace_abn_0.tmp_0' if inplace else 'batch_norm_0.tmp_0',
                    'inplace_abn_0.tmp_1' if inplace else 'batch_norm_0.tmp_1',
                    'bn_scale@GRAD',
                    'bn_bias@GRAD',
                    'input@GRAD',
                ]
                fetch_name += others
            for nm in fetch_name:
                fv = fluid.framework._get_var(str(nm), program=main)
                fv.persistable = True

            build_strategy = fluid.BuildStrategy()
            build_strategy.sync_batch_norm = use_cuda and \
                fluid.core.get_cuda_device_count() > 1
            build_strategy.enable_inplace = inplace
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = 1 if os.name == 'nt' else 0
            comp_prog1 = compiler.CompiledProgram(main).with_data_parallel(
                outs[0].name if not only_forward else None,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
            bn_fetches = exe.run(program=comp_prog1,
                                 feed={'input': data},
                                 fetch_list=fetch_name)
            fetch_outs.append(bn_fetches)
            fetch_names.append(fetch_name)

        for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs +
                                                           fetch_names)):
            self.assertTrue(
                np.allclose(
                    bn_val, inplace_abn_val, atol=1e-2),
                "Output (" + name1 + ":" + name2 +
                ") has diff on {} with {} layout and {} activation. \n".format(
                    place, layout, activation) + "\nBN " + str(bn_val) +
                "\n" + "Inplace ABN " + str(inplace_abn_val))

    def test_op(self):
        use_cudas = [False, True] if core.is_compiled_with_cuda() else [False]
        for use_cuda in use_cudas:
            place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
            layouts = ["NCHW", "NHWC"]
            for layout in layouts:
                for activation, alpha in zip([None, 'elu', 'leaky_relu'],
                                             [0., 1., 0.02]):
                    for infer_only in [True, False]:
                        self.compare(place, layout, infer_only, activation,
                                     alpha, use_cuda)

    def test_all_branches(self):
        seed = 10
        os.environ['FLAGS_cudnn_deterministic'] = "1"
        data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
        use_cudas = [False, True] if core.is_compiled_with_cuda() else [False]
        alpha = 0.1
        layouts = ["NCHW", "NHWC"]
        for use_cuda in use_cudas:
            place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
            for layout in layouts:
                for activation in ['identity', 'leaky_relu']:
                    main, startup, outs = self.build_program(
                        place, layout, seed, False, activation, alpha, use_cuda,
                        True)
                    exe = fluid.Executor(place)
                    exe.run(startup)
                    exe.run(program=main, feed={'input': data})


if __name__ == '__main__':
    unittest.main()