Support sync batch norm. (#16121)
* Support Sync Batch Norm.
* Note: do not enable it when running on a single device; synchronized statistics only matter across multiple devices.
Usage:
build_strategy = fluid.BuildStrategy()
build_strategy.sync_batch_norm = True
binary = fluid.compiler.CompiledProgram(tp).with_data_parallel(
    loss_name=loss_mean.name,
    build_strategy=build_strategy)
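
Here tp and loss_mean are the caller's main program and loss variable. A fuller minimal sketch (hypothetical layer and variable names; assumes a CUDA build with more than one visible device):

import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32')
conv = fluid.layers.conv2d(input=image, num_filters=16, filter_size=3)
bn = fluid.layers.batch_norm(conv)  # rewritten to sync_batch_norm by the pass
loss_mean = fluid.layers.reduce_mean(bn)
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss_mean)

build_strategy = fluid.BuildStrategy()
build_strategy.sync_batch_norm = True
binary = fluid.compiler.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(
        loss_name=loss_mean.name, build_strategy=build_strategy)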
@@ -0,0 +1,45 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
#include <memory>
#include <string>
#include <utility>

namespace paddle {
namespace framework {
namespace ir {

std::unique_ptr<ir::Graph> SyncBatchNormPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  VLOG(3) << "Use synchronous batch norm";
  for (const Node* n : graph->Nodes()) {
    if (n->IsOp()) {
      auto* op = n->Op();
      if (op->Type() == "batch_norm") {
        op->SetType("sync_batch_norm");
      }
      if (op->Type() == "batch_norm_grad") {
        op->SetType("sync_batch_norm_grad");
      }
    }
  }
  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(sync_batch_norm_pass, paddle::framework::ir::SyncBatchNormPass);
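
The pass is a pure type rewrite: it walks every node in the graph and renames batch_norm ops to sync_batch_norm (and batch_norm_grad to sync_batch_norm_grad), leaving inputs, outputs, and attributes untouched; the actual cross-device statistics reduction lives in the sync_batch_norm kernels. The effect can be emulated from Python at the ProgramDesc level (a sketch for illustration only; the real rewrite runs on the ir::Graph inside CompiledProgram):

import paddle.fluid as fluid

prog = fluid.Program()
with fluid.program_guard(prog):
    x = fluid.layers.data(name='x', shape=[16, 8, 8], dtype='float32')
    fluid.layers.batch_norm(x)

# Apply the same rename the pass performs on the graph.
for op in prog.global_block().ops:
    if op.type == 'batch_norm':
        op.desc.set_type('sync_batch_norm')

print([op.type for op in prog.global_block().ops])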
@@ -0,0 +1,32 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <memory>
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

class SyncBatchNormPass : public Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
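
The header is the pass's entire public surface: SyncBatchNormPass derives from ir::Pass and overrides ApplyImpl, and REGISTER_PASS in the implementation file binds it to the name "sync_batch_norm_pass", the handle that the unit test below (and the sync_batch_norm build strategy) uses to fetch it from the registry.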
@@ -0,0 +1,80 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/sync_batch_norm_pass.h"
#include <gtest/gtest.h>

namespace paddle {
namespace framework {
namespace ir {

void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs) {
  auto* op = prog->MutableBlock(0)->AppendOp();
  op->SetType(type);
  op->SetAttr("name", name);
  op->SetInput("X", inputs);
  op->SetOutput("Out", outputs);
}

// (a, conv_w)->conv2d->b
// (b, bn_scale, bn_bias, mean, var)->batch_norm
//     ->(c, mean, var, save_mean, save_inv_var)
ProgramDesc BuildProgramDesc() {
  ProgramDesc prog;
  for (auto& v : std::vector<std::string>({"a", "conv_w", "b", "bn_scale",
                                           "bn_bias", "mean", "var", "c",
                                           "save_mean", "save_inv_var"})) {
    auto* var = prog.MutableBlock(0)->Var(v);
    if (v == "conv_w" || v == "bn_scale" || v == "bn_bias" || v == "mean" ||
        v == "var") {
      var->SetPersistable(true);
    }
  }

  SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"a", "conv_w"}),
        std::vector<std::string>({"b"}));
  SetOp(&prog, "batch_norm", "bn",
        std::vector<std::string>({"b", "bn_scale", "bn_bias", "mean", "var"}),
        std::vector<std::string>(
            {"c", "mean", "var", "save_mean", "save_inv_var"}));
  return prog;
}

TEST(SyncBatchNormPass, basic) {
  auto prog = BuildProgramDesc();

  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));

  auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass");

  graph = pass->Apply(std::move(graph));

  for (auto* node : graph->Nodes()) {
    if (node->IsOp()) {
      auto* op = node->Op();
      auto op_name = boost::get<std::string>(op->GetAttr("name"));
      if (op_name == "bn") {
        ASSERT_EQ(op->Type(), "sync_batch_norm");
      }
    }
  }
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

USE_PASS(sync_batch_norm_pass);
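
The test builds a two-op program (conv2d feeding batch_norm), converts it to an ir::Graph, fetches the pass from the registry by name, applies it, and asserts that the op tagged "bn" now has type sync_batch_norm.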
@@ -0,0 +1,20 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/batch_norm_op.h"

namespace ops = paddle::operators;
REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
REGISTER_OPERATOR(sync_batch_norm_grad, ops::BatchNormGradOp);
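
sync_batch_norm deliberately reuses BatchNormOp, BatchNormOpMaker, BatchNormOpInferVarType, and BatchNormGradMaker, so its operator definition, shape inference, and gradient wiring are identical to plain batch_norm; only the compute kernels differ, and those presumably make up the large CUDA diff suppressed below.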
File diff suppressed because it is too large
@@ -0,0 +1,159 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
import os
import six
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler


class TestSyncBatchNormOpTraining(unittest.TestCase):
    def setUp(self):
        #self.dtype = np.float32
        self.dtype = np.float64
        self.N = 32
        self.C = 16
        self.H = 64
        self.W = 32
        self.dshape = [self.N, self.C, self.H, self.W]

    def build_program(self,
                      place,
                      layout,
                      seed,
                      sync_bn=False,
                      only_forward=False):
        main = fluid.Program()
        startup = fluid.Program()
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                data = fluid.layers.data(
                    name='input',
                    shape=self.dshape,
                    dtype=self.dtype,
                    append_batch_size=False)
                conv = fluid.layers.conv2d(
                    input=data,
                    num_filters=32,
                    filter_size=1,
                    param_attr=fluid.ParamAttr(name='conv2d_weight'),
                    bias_attr=False,
                    use_cudnn=False)
                bn = fluid.layers.batch_norm(
                    conv,
                    param_attr=fluid.ParamAttr(name='bn_scale'),
                    bias_attr=fluid.ParamAttr(name='bn_bias'),
                    moving_mean_name='bn_moving_mean',
                    moving_variance_name='bn_moving_variance',
                    data_layout=layout,
                    is_test=only_forward)
                sigmoid = fluid.layers.sigmoid(bn)
                out = fluid.layers.reduce_sum(sigmoid)
                if not sync_bn:
                    out = out / core.get_cuda_device_count()
                if not only_forward:
                    sgd_opt = fluid.optimizer.SGD(learning_rate=0.0)
                    sgd_opt.backward(out)
        return main, startup, [out, conv, bn]

    def compare(self, place, layout, only_forward):
        seed = 10
        os.environ['FLAGS_cudnn_deterministic'] = "1"
        data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
        # Single-GPU, N = 32 per GPU
        main, startup, outs = self.build_program(place, layout, seed, False,
                                                 only_forward)
        exe = fluid.Executor(place)
        exe.run(startup)
        fetch_names = [v.name for v in outs] + [
            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
        ]
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
            ]
            fetch_names += others
        bn_fetches = exe.run(program=main,
                             feed={'input': data},
                             fetch_list=fetch_names)

        #####################################################################
        # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
        main, startup, outs = self.build_program(place, layout, seed, True,
                                                 only_forward)
        exe = fluid.Executor(place)
        exe.run(startup)
        fetch_names = [v.name for v in outs] + [
            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
        ]
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
            ]
            fetch_names += others
        for nm in fetch_names:
            fv = fluid.framework._get_var(str(nm), program=main)
            fv.persistable = True
        build_strategy = fluid.BuildStrategy()
        build_strategy.sync_batch_norm = True
        build_strategy.enable_inplace = False
        build_strategy.memory_optimize = False
        comp_prog = compiler.CompiledProgram(main).with_data_parallel(
            outs[0].name if not only_forward else None,
            build_strategy=build_strategy)
        sync_bn_fetches = exe.run(program=comp_prog,
                                  feed={'input': data},
                                  fetch_list=fetch_names)

        for i in six.moves.xrange(1, len(sync_bn_fetches)):
            bn_val = bn_fetches[i]
            sync_bn_val = sync_bn_fetches[i]
            if sync_bn_val.shape != bn_val.shape:
                sync_bn_val = sync_bn_val[:bn_val.shape[0]]
            self.assertTrue(
                np.allclose(
                    bn_val, sync_bn_val, atol=1e-3),
                "Output (" + fetch_names[i] + ") has diff. \n" + "\nBN " +
                str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))

    def test_train(self):
        if not core.is_compiled_with_cuda():
            return

        places = [core.CUDAPlace(0)]
        for place in places:
            for layout in ["NCHW", "NHWC"]:
                self.compare(place, layout, False)

    def test_infer(self):
        if not core.is_compiled_with_cuda():
            return

        places = [core.CUDAPlace(0)]
        for place in places:
            for layout in ["NCHW", "NHWC"]:
                self.compare(place, layout, True)


if __name__ == '__main__':
    unittest.main()
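
The comparison strategy: run the same seeded network twice, once as a plain single-device program and once compiled with sync_batch_norm = True under data parallelism, with FLAGS_cudnn_deterministic set so the convolution results are reproducible. Dividing the reference loss by core.get_cuda_device_count() in the non-sync run keeps the gradients comparable, since the data-parallel executor effectively averages gradients across devices. Multi-device fetches come back concatenated across devices, which is why a mismatched shape is sliced down to the first device's copy before the np.allclose check at atol=1e-3.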