supports collective training with programs (#18392)

1. Since the allreduce op has four reduce types, we split it into four ops, one per reduce type. 2. We also refined the collective op code: e.g. we separated each collective op kernel into a CPUKernel and a CUDAKernel, and removed the device-specific DeviceContext template parameter, since the target DeviceContext is already known. 3. We removed the newly added Collective op role to reduce the complexity of program and graph analysis.
6 years ago · a873fa84ce
parent 85b49d8473
commit a873fa84ce
30 changed files with 669 additions and 428 deletions
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@ -74,7 +74,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
               static_cast<int>(OpRole::kBackward),
           static_cast<int>(OpRole::kOptimize) |
               static_cast<int>(OpRole::kLRSched),
-           static_cast<int>(OpRole::kCollective),
           static_cast<int>(OpRole::kNotSpecified)})
      .SetDefault(static_cast<int>(OpRole::kNotSpecified));
  AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@ -34,9 +34,6 @@ enum class OpRole {
  kDist = 0x0008,
  // Tag all learning rate scheduler operators.
  kLRSched = 0x0010,
-  // Collective role is for all collective operators and other operators used
-  // for collective training
-  kCollective = 0x0020,

  kLoss = 0x0100,
  // The default value of op's role. This should be only used for unittests and
--- a/paddle/fluid/operators/collective/c_allgather_op.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cc
@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/collective/c_allgather_op.h"
-#include <future>  // NOLINT
+
 #include <memory>
-#include <ostream>

 namespace paddle {
 namespace operators {
@ -25,8 +24,7 @@ class CAllGatherOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SyncFCGather op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
    int nranks = ctx->Attrs().Get<int>("nranks");
    PADDLE_ENFORCE_GE(nranks, 2, "nranks should be >=2");
    framework::DDim dim = ctx->GetInputDim("X");
@ -49,10 +47,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("nranks",
                 "Total trainer count of the distributed training job");
    AddComment(R"DOC(
-***CAllGather Operator***
+CAllGather Operator
 each rank receives the aggregation of data from all ranks in the order of the ranks

-Call NCCL collective AllGather internally.https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/api/colls.html#c.ncclAllGather
+reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allgather
 )DOC");
  }
 };
@ -81,9 +79,8 @@ namespace plat = paddle::platform;
 REGISTER_OPERATOR(c_allgather, ops::CAllGatherOp, ops::CAllGatherOpGradMaker,
                  ops::CAllGatherOpMaker);

-REGISTER_OP_CPU_KERNEL(
-    c_allgather, ops::CAllGatherOpKernel<plat::CPUDeviceContext, float>,
-    ops::CAllGatherOpKernel<plat::CPUDeviceContext, double>,
-    ops::CAllGatherOpKernel<plat::CPUDeviceContext, int>,
-    ops::CAllGatherOpKernel<plat::CPUDeviceContext, int64_t>,
-    ops::CAllGatherOpKernel<plat::CPUDeviceContext, plat::float16>);
+REGISTER_OP_CPU_KERNEL(c_allgather, ops::CAllGatherOpCPUKernel<float>,
+                       ops::CAllGatherOpCPUKernel<double>,
+                       ops::CAllGatherOpCPUKernel<int>,
+                       ops::CAllGatherOpCPUKernel<int64_t>,
+                       ops::CAllGatherOpCPUKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
@ -14,12 +14,64 @@ limitations under the License. */

 #include "paddle/fluid/operators/collective/c_allgather_op.h"

+#include <memory>
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+// CUDA kernel for c_allgather: calls ncclAllGather to fill Out, whose dim 0 is X's dim 0 times nranks.
+template <typename T>
+class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
+    // The "nranks" attribute must equal the rank count of the communicator looked up by "ring_id".
+    int nranks = ctx.Attr<int>("nranks");
+    int rid = ctx.Attr<int>("ring_id");
+    auto comm = platform::NCCLCommContext::Instance().Get(rid);
+    PADDLE_ENFORCE_EQ(nranks, comm->nranks());
+    // Out takes X's dims with dim 0 multiplied by nranks; mutable_data is called with those dims.
+    auto place = ctx.GetPlace();
+    framework::DDim out_dims = in->dims();
+    out_dims[0] *= nranks;
+    out->mutable_data<T>(out_dims, place);
+    // The send count is the full element count of X; the receive buffer is Out's data pointer.
+    int64_t send_numel = in->numel();
+    const T* send_buff = in->data<T>();
+    T* recv_buff = out->data<T>();
+    // Stream choice: the device context's stream if "use_calc_stream" is set, else the communicator's stream.
+    cudaStream_t stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    // dtype is already declared as ncclDataType_t, so the static_cast below leaves it unchanged.
+    PADDLE_ENFORCE(platform::dynload::ncclAllGather(
+        send_buff, recv_buff, send_numel, static_cast<ncclDataType_t>(dtype),
+        comm->comm(), stream));
+#else
+    PADDLE_THROW("PaddlePaddle should compile with GPU.");
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-REGISTER_OP_CUDA_KERNEL(
-    c_allgather, ops::CAllGatherOpKernel<plat::CUDADeviceContext, float>,
-    ops::CAllGatherOpKernel<plat::CUDADeviceContext, double>,
-    ops::CAllGatherOpKernel<plat::CUDADeviceContext, int>,
-    ops::CAllGatherOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::CAllGatherOpKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(c_allgather, ops::CAllGatherOpCUDAKernel<float>,
+                        ops::CAllGatherOpCUDAKernel<double>,
+                        ops::CAllGatherOpCUDAKernel<int>,
+                        ops::CAllGatherOpCUDAKernel<int64_t>,
+                        ops::CAllGatherOpCUDAKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_allgather_op.h
+++ b/paddle/fluid/operators/collective/c_allgather_op.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
 #include <algorithm>
 #include <utility>
 #include <vector>
@ -22,52 +23,14 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
-class CAllGatherOpKernel : public framework::OpKernel<T> {
+template <typename T>
+class CAllGatherOpCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE(is_gpu_place(place),
-                   "CAllGatherOp can run on gpu place only for now.");
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto in = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
-
-    int rid = ctx.Attr<int>("ring_id");
-    auto comm = platform::NCCLCommContext::Instance().Get(rid);
-    int nranks = comm->nranks();
-
-    framework::DDim out_dims = in->dims();
-    out_dims[0] *= nranks;
-    out->mutable_data<T>(out_dims, place);
-
-    int64_t send_numel = in->numel();
-    const T* send_buff = in->data<T>();
-    T* recv_buff = out->data<T>();
-
-    cudaStream_t stream = nullptr;
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    PADDLE_ENFORCE(platform::dynload::ncclAllGather(
-        send_buff, recv_buff, send_numel, static_cast<ncclDataType_t>(dtype),
-        comm->comm(), stream));
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
+    PADDLE_THROW("unimplemented cpu kernel for CAllGatherOp.");
  }
 };

--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc
@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace operators {
+// Proto maker for c_allreduce_max: supplies "Max" as the name CAllReduceOpMaker formats into the op comment.
+class CAllReduceMaxOpMaker : public CAllReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Max"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// c_allreduce_max is registered with REGISTER_OP_WITHOUT_GRADIENT; no grad maker is passed.
+REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max, ops::CAllReduceOp,
+                             ops::CAllReduceMaxOpMaker);
+// CPU kernels instantiated with kRedMax for float, double, int, int64_t and float16.
+REGISTER_OP_CPU_KERNEL(c_allreduce_max,
+                       ops::CAllReduceOpCPUKernel<ops::kRedMax, float>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedMax, double>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedMax, int>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedMax, int64_t>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedMax, plat::float16>);
--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
@ -18,8 +18,8 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;

 REGISTER_OP_CUDA_KERNEL(
-    c_allreduce, ops::CAllReduceOpKernel<plat::CUDADeviceContext, float>,
-    ops::CAllReduceOpKernel<plat::CUDADeviceContext, double>,
-    ops::CAllReduceOpKernel<plat::CUDADeviceContext, int>,
-    ops::CAllReduceOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::CAllReduceOpKernel<plat::CUDADeviceContext, plat::float16>);
+    c_allreduce_max, ops::CAllReduceOpCUDAKernel<ops::kRedMax, float>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedMax, double>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedMax, int>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedMax, int64_t>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedMax, plat::float16>)
--- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc
@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace operators {
+// Proto maker for c_allreduce_min: supplies "Min" as the name CAllReduceOpMaker formats into the op comment.
+class CAllReduceMinOpMaker : public CAllReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Min"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// c_allreduce_min is registered with REGISTER_OP_WITHOUT_GRADIENT; no grad maker is passed.
+REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_min, ops::CAllReduceOp,
+                             ops::CAllReduceMinOpMaker);
+// CPU kernels instantiated with kRedMin for float, double, int, int64_t and float16.
+REGISTER_OP_CPU_KERNEL(c_allreduce_min,
+                       ops::CAllReduceOpCPUKernel<ops::kRedMin, float>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedMin, double>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedMin, int>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedMin, int64_t>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedMin, plat::float16>);
--- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// CUDA kernels for c_allreduce_min, instantiated with kRedMin for float, double, int, int64_t and float16.
+REGISTER_OP_CUDA_KERNEL(
+    c_allreduce_min, ops::CAllReduceOpCUDAKernel<ops::kRedMin, float>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedMin, double>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedMin, int>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedMin, int64_t>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedMin, plat::float16>)
--- a/paddle/fluid/operators/collective/c_allreduce_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_op.cc
@ -1,83 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <future>  // NOLINT
-#include <ostream>
-
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-class CAllReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
-                                   ctx.GetPlace());
-  }
-};
-
-class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "(Tensor), tensor to be allreduced.");
-    AddOutput("Out", "(Tensor) the allreduced result.");
-    AddAttr<int>("reduce_type", "(int default 0) determin the reduce type.")
-        .SetDefault(0);
-    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
-        .SetDefault(0);
-    AddAttr<bool>(
-        "use_calc_stream",
-        "(bool default false) eject CUDA operations to calculation stream.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-***CAllReduce Operator***
-
-Call NCCL collective AllReduce internally. Note that this op must be used when one
-thread is managing one GPU device.
-
-For speed reasons, reduce_type should be an integer:
-
-0: sum
-1: prod
-2: max
-3: min 
-If input and output are the same variable, in-place allreduce will be used.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_WITHOUT_GRADIENT(c_allreduce, ops::CAllReduceOp,
-                             ops::CAllReduceOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    c_allreduce, ops::CAllReduceOpKernel<plat::CPUDeviceContext, float>,
-    ops::CAllReduceOpKernel<plat::CPUDeviceContext, double>,
-    ops::CAllReduceOpKernel<plat::CPUDeviceContext, int>,
-    ops::CAllReduceOpKernel<plat::CPUDeviceContext, int64_t>,
-    ops::CAllReduceOpKernel<plat::CPUDeviceContext, plat::float16>);
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include <algorithm>
-#include <utility>
-#include <vector>
+
+#include <string>

 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@ -29,17 +28,41 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
-class CAllReduceOpKernel : public framework::OpKernel<T> {
+enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd };  // compile-time tag; CAllReduceOpCUDAKernel maps it to an ncclRedOp_t
+// Operator class used by the c_allreduce_{sum,max,min,prod} registrations.
+class CAllReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Out is given the same dims as X.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+  // The kernel type is built from X's data type and the place reported by the execution context.
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
+                                   ctx.GetPlace());
+  }
+};
+// CPU kernel placeholder for the c_allreduce ops: Compute always throws; red_type is not used.
+template <ReduceType red_type, typename T>
+class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW("CAllReduce op do not support CPUKernel for now.");
+  }
+};
+
+template <ReduceType red_type, typename T>
+class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE(is_gpu_place(place),
-                   "CAllReduce op can run on gpu place only for now.");
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    auto in = ctx.Input<framework::Tensor>("X");
    auto out = ctx.Output<framework::Tensor>("Out");

+    auto place = ctx.GetPlace();
    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
    int64_t numel = in->numel();
    const void* sendbuff = in->data<void>();
@ -49,23 +72,6 @@ class CAllReduceOpKernel : public framework::OpKernel<T> {
    int rid = ctx.Attr<int>("ring_id");
    auto comm = platform::NCCLCommContext::Instance().Get(rid);

-    int reduce_type = ctx.Attr<int>("reduce_type");
-    ncclRedOp_t red_type = ncclSum;
-    switch (reduce_type) {
-      case 0:
-        red_type = ncclSum;
-        break;
-      case 1:
-        red_type = ncclProd;
-        break;
-      case 2:
-        red_type = ncclMax;
-        break;
-      case 3:
-        red_type = ncclMin;
-        break;
-    }
-
    cudaStream_t stream = nullptr;
    if (ctx.Attr<bool>("use_calc_stream")) {
      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
@ -74,13 +80,60 @@ class CAllReduceOpKernel : public framework::OpKernel<T> {
      stream = comm->stream();
    }

+    ncclRedOp_t nccl_red_type = ncclSum;
+    switch (red_type) {
+      case kRedSum:
+        nccl_red_type = ncclSum;
+        break;
+
+      case kRedMax:
+        nccl_red_type = ncclMax;
+        break;
+
+      case kRedMin:
+        nccl_red_type = ncclMin;
+        break;
+
+      case kRedProd:
+        nccl_red_type = ncclProd;
+        break;
+
+      default:
+        PADDLE_THROW("Invalid reduce type: %d", red_type);
+    }
+
    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-        sendbuff, recvbuff, numel, dtype, red_type, comm->comm(), stream));
+        sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
 #else
    PADDLE_THROW("PaddlePaddle should compile with GPU.");
 #endif
  }
 };

+class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor), tensor to be allreduced.");
+    AddOutput("Out", "(Tensor) the allreduced result.");
+    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
+        .SetDefault(0);
+    AddAttr<bool>(
+        "use_calc_stream",
+        "(bool default false) eject CUDA operations to calculation stream.")
+        .SetDefault(false);
+    AddComment(string::Sprintf(R"DOC(
+CAllReduce %s Operator
+
+Call collective AllReduce with reduce type %s. If input and output are
+the same variable, in-place allreduce will be used.
+Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allreduce
+)DOC",
+                               GetName(), GetName()));
+  }
+
+ protected:
+  virtual std::string GetName() const = 0;
+};
+
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace operators {
+// Proto maker for c_allreduce_prod: supplies "Prod" as the name CAllReduceOpMaker formats into the op comment.
+class CAllReduceProdOpMaker : public CAllReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Prod"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// c_allreduce_prod is registered with REGISTER_OP_WITHOUT_GRADIENT; no grad maker is passed.
+REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod, ops::CAllReduceOp,
+                             ops::CAllReduceProdOpMaker);
+// CPU kernels instantiated with kRedProd for float, double, int, int64_t and float16.
+REGISTER_OP_CPU_KERNEL(c_allreduce_prod,
+                       ops::CAllReduceOpCPUKernel<ops::kRedProd, float>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedProd, double>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedProd, int>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedProd, int64_t>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedProd, plat::float16>)
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// CUDA kernels for c_allreduce_prod, instantiated with kRedProd for float, double, int, int64_t and float16.
+REGISTER_OP_CUDA_KERNEL(
+    c_allreduce_prod, ops::CAllReduceOpCUDAKernel<ops::kRedProd, float>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedProd, double>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedProd, int>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedProd, int64_t>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedProd, plat::float16>)
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
@ -0,0 +1,54 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace operators {
+// Grad maker for c_allreduce_sum: emits another c_allreduce_sum whose X is the gradient of Out and whose Out is the gradient of X.
+class CAllReduceSumOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> retv(new framework::OpDesc());
+    retv->SetType("c_allreduce_sum");
+    retv->SetInput("X", OutputGrad("Out"));
+    retv->SetOutput("Out", InputGrad("X"));
+    retv->SetAttrMap(Attrs());  // the attribute map returned by Attrs() is passed through unchanged
+    return retv;
+  }
+};
+// Proto maker for c_allreduce_sum: supplies "Sum" as the name CAllReduceOpMaker formats into the op comment.
+class CAllReduceSumOpMaker : public CAllReduceOpMaker {
+ protected:
+  std::string GetName() const override { return "Sum"; }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// c_allreduce_sum is registered with REGISTER_OPERATOR and is passed CAllReduceSumOpGradMaker as its grad maker.
+REGISTER_OPERATOR(c_allreduce_sum, ops::CAllReduceOp,
+                  ops::CAllReduceSumOpGradMaker, ops::CAllReduceSumOpMaker);
+// CPU kernels instantiated with kRedSum for float, double, int, int64_t and float16.
+REGISTER_OP_CPU_KERNEL(c_allreduce_sum,
+                       ops::CAllReduceOpCPUKernel<ops::kRedSum, float>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedSum, double>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int64_t>,
+                       ops::CAllReduceOpCPUKernel<ops::kRedSum, plat::float16>)
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
@ -0,0 +1,25 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+// CUDA kernels for c_allreduce_sum, instantiated with kRedSum for float, double, int, int64_t and float16.
+REGISTER_OP_CUDA_KERNEL(
+    c_allreduce_sum, ops::CAllReduceOpCUDAKernel<ops::kRedSum, float>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedSum, double>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int64_t>,
+    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
--- a/paddle/fluid/operators/collective/c_broadcast_op.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cc
@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <future>  // NOLINT
-#include <ostream>
-
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"

 namespace paddle {
@ -50,9 +47,9 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
        "(bool default false) eject CUDA operations to calculation stream.")
        .SetDefault(false);
    AddComment(R"DOC(
-***CBroadcast Operator***
+CBroadcast Operator

-Call ncclBcast internally.
+Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#broadcast
 )DOC");
  }
 };
@ -66,9 +63,8 @@ namespace plat = paddle::platform;
 REGISTER_OP_WITHOUT_GRADIENT(c_broadcast, ops::CBroadcastOp,
                             ops::CBroadcastOpMaker);

-REGISTER_OP_CPU_KERNEL(
-    c_broadcast, ops::CBroadcastOpKernel<plat::CPUDeviceContext, float>,
-    ops::CBroadcastOpKernel<plat::CPUDeviceContext, double>,
-    ops::CBroadcastOpKernel<plat::CPUDeviceContext, int>,
-    ops::CBroadcastOpKernel<plat::CPUDeviceContext, int64_t>,
-    ops::CBroadcastOpKernel<plat::CPUDeviceContext, plat::float16>);
+REGISTER_OP_CPU_KERNEL(c_broadcast, ops::CBroadcastOpCPUKernel<float>,
+                       ops::CBroadcastOpCPUKernel<double>,
+                       ops::CBroadcastOpCPUKernel<int>,
+                       ops::CBroadcastOpCPUKernel<int64_t>,
+                       ops::CBroadcastOpCPUKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
@ -14,12 +14,74 @@ limitations under the License. */

 #include "paddle/fluid/operators/collective/c_broadcast_op.h"

+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+// CUDA kernel for the c_broadcast op.
+//
+// Broadcasts input "X" from the rank given by the "root" attribute to output
+// "Out" on every rank of the NCCL communicator selected by "ring_id", using
+// ncclBcast.
+//
+// Attributes read:
+//   ring_id          id used to look up the communicator in NCCLCommContext.
+//   root             rank that owns the data; must lie in [0, nranks).
+//   use_calc_stream  when true the call is issued on the CUDADeviceContext
+//                    stream of the current place, otherwise on the
+//                    communicator's own stream.
+//
+// Raises a Paddle error when "root" is out of range, when an NCCL call fails,
+// or when the build has no CUDA support (or targets Windows).
+template <typename T>
+class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    // Keep the count 64-bit (as the reduce-scatter kernel does) so a large
+    // tensor is not truncated before it reaches NCCL.
+    const int64_t numel = x->numel();
+    const ncclDataType_t dtype = platform::ToNCCLDataType(x->type());
+
+    const int rid = ctx.Attr<int>("ring_id");
+    auto comm = platform::NCCLCommContext::Instance().Get(rid);
+
+    auto place = ctx.GetPlace();
+    cudaStream_t stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    // Reject an invalid root up front instead of handing it to NCCL.
+    const int root = ctx.Attr<int>("root");
+    const int nranks = comm->nranks();
+    PADDLE_ENFORCE(root >= 0 && root < nranks,
+                   "Expected root in range of [0,%d),but get %d", nranks, root);
+
+    if (root == comm->rank()) {
+      // ncclBcast takes a non-const buffer even on the sending side, hence
+      // the const_cast on the input data pointer.
+      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype,
+          root, comm->comm(), stream));
+      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
+              << x->numel();
+
+      // The root already holds the payload in X; mirror it into Out unless
+      // the op runs in place.
+      if (out != x) {
+        framework::TensorCopy(
+            *static_cast<const framework::Tensor*>(x), place,
+            *platform::DeviceContextPool::Instance().Get(place),
+            static_cast<framework::Tensor*>(out));
+      }
+    } else {
+      // Give Out its final shape before allocating, so the receive buffer is
+      // guaranteed to hold the numel elements NCCL writes into it.
+      out->Resize(x->dims());
+      PADDLE_ENFORCE(platform::dynload::ncclBcast(out->mutable_data<T>(place),
+                                                  numel, dtype, root,
+                                                  comm->comm(), stream));
+      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
+              << framework::product(out->dims());
+    }
+
+    out->Resize(x->dims());
+    out->set_lod(x->lod());
+#else
+    PADDLE_THROW("PaddlePaddle should compile with GPU.");
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-REGISTER_OP_CUDA_KERNEL(
-    c_broadcast, ops::CBroadcastOpKernel<plat::CUDADeviceContext, float>,
-    ops::CBroadcastOpKernel<plat::CUDADeviceContext, double>,
-    ops::CBroadcastOpKernel<plat::CUDADeviceContext, int>,
-    ops::CBroadcastOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::CBroadcastOpKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(c_broadcast, ops::CBroadcastOpCUDAKernel<float>,
+                        ops::CBroadcastOpCUDAKernel<double>,
+                        ops::CBroadcastOpCUDAKernel<int>,
+                        ops::CBroadcastOpCUDAKernel<int64_t>,
+                        ops::CBroadcastOpCUDAKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_broadcast_op.h
+++ b/paddle/fluid/operators/collective/c_broadcast_op.h
@ -22,69 +22,14 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
-class CBroadcastOpKernel : public framework::OpKernel<T> {
+template <typename T>
+class CBroadcastOpCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE(is_gpu_place(place),
-                   "CBroadcastOp can run on gpu place only for now.");
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto x = ctx.Input<framework::LoDTensor>("X");
-    auto out = ctx.Output<framework::LoDTensor>("Out");
-    int numel = x->numel();
-    ncclDataType_t dtype = platform::ToNCCLDataType(x->type());
-
-    int rid = ctx.Attr<int>("ring_id");
-    auto comm = platform::NCCLCommContext::Instance().Get(rid);
-
-    cudaStream_t stream = nullptr;
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    int root = ctx.Attr<int>("root");
-    int nranks = comm->nranks();
-    PADDLE_ENFORCE(root >= 0 && root < nranks,
-                   "Expected root in range of [0,%d),but get %d", nranks, root);
-    if (root == comm->rank()) {
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
-          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype,
-          root, comm->comm(), stream));
-      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
-              << x->numel();
-
-      if (out != x) {
-        // TODO(liuyi05): check inplace
-        framework::TensorCopy(
-            *static_cast<const framework::Tensor*>(x), place,
-            *platform::DeviceContextPool::Instance().Get(place),
-            static_cast<framework::Tensor*>(out));
-      }
-    } else {
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(out->mutable_data<T>(place),
-                                                  numel, dtype, root,
-                                                  comm->comm(), stream));
-      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
-              << framework::product(out->dims());
-    }
-
-    out->Resize(x->dims());
-    out->set_lod(x->lod());
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
+    PADDLE_THROW("Unimplemented cpu kernel for CBroadcastOp.");
  }
 };

--- a/paddle/fluid/operators/collective/c_comm_init_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_op.cc
@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
+
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
@ -11,9 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include <nccl.h>
 #endif
+
 #include <stdint.h>
 #include <ostream>
 #include <string>
@ -24,9 +26,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
+
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
+
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/collective/c_reducescatter_op.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc
@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-#include <future>  // NOLINT
+
 #include <memory>
-#include <ostream>

 namespace paddle {
 namespace operators {
@ -54,9 +53,9 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker {
        "(bool default false) eject CUDA operations to calculation stream.")
        .SetDefault(false);
    AddComment(R"DOC(
-***CReduceScatter Operator***
+CReduceScatter Operator

-Call NCCL collective ReduceScatter internally.
+Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#reducescatter
 )DOC");
  }
 };
@ -85,9 +84,8 @@ namespace plat = paddle::platform;
 REGISTER_OPERATOR(c_reducescatter, ops::CReduceScatterOp,
                  ops::CReduceScatterOpMaker);

-REGISTER_OP_CPU_KERNEL(
-    c_reducescatter, ops::CReduceScatterOpKernel<plat::CPUDeviceContext, float>,
-    ops::CReduceScatterOpKernel<plat::CPUDeviceContext, double>,
-    ops::CReduceScatterOpKernel<plat::CPUDeviceContext, int>,
-    ops::CReduceScatterOpKernel<plat::CPUDeviceContext, int64_t>,
-    ops::CReduceScatterOpKernel<plat::CPUDeviceContext, plat::float16>);
+REGISTER_OP_CPU_KERNEL(c_reducescatter, ops::CReduceScatterOpCPUKernel<float>,
+                       ops::CReduceScatterOpCPUKernel<double>,
+                       ops::CReduceScatterOpCPUKernel<int>,
+                       ops::CReduceScatterOpCPUKernel<int64_t>,
+                       ops::CReduceScatterOpCPUKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
@ -14,13 +14,61 @@ limitations under the License. */

 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"

+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+// CUDA kernel for the c_reducescatter op.
+//
+// Sums input "X" across all ranks of the NCCL communicator selected by
+// "ring_id" and leaves each rank with one equal slice of the result in "Out",
+// using ncclReduceScatter with ncclSum. Out takes X's shape with dim 0
+// divided by the communicator's rank count.
+//
+// Attributes read:
+//   ring_id          id used to look up the communicator in NCCLCommContext.
+//   use_calc_stream  when true the call is issued on the CUDADeviceContext
+//                    stream of the current place, otherwise on the
+//                    communicator's own stream.
+//
+// Raises a Paddle error when dim 0 of X is not divisible by the rank count,
+// when the NCCL call fails, or when the build has no CUDA support (or targets
+// Windows).
+template <typename T>
+class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    const int rid = ctx.Attr<int>("ring_id");
+    auto comm = platform::NCCLCommContext::Instance().Get(rid);
+    const int nranks = comm->nranks();
+
+    auto place = ctx.GetPlace();
+    auto out_dims = in->dims();
+    // Integer division would silently drop the remainder and leave the tail
+    // of X out of the scatter, so demand an even split.
+    PADDLE_ENFORCE(out_dims[0] % nranks == 0,
+                   "The first dimension of X must be divisible by nranks.");
+    out_dims[0] = out_dims[0] / nranks;
+
+    // Number of elements each rank receives.
+    const int64_t recv_numel = in->numel() / nranks;
+    const T* send_buff = in->data<T>();
+    T* recv_buff = out->mutable_data<T>(out_dims, place);
+    const ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
+
+    cudaStream_t stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    PADDLE_ENFORCE(platform::dynload::ncclReduceScatter(
+        send_buff, recv_buff, recv_numel, dtype, ncclSum, comm->comm(),
+        stream));
+#else
+    PADDLE_THROW("PaddlePaddle should compile with GPU.");
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-REGISTER_OP_CUDA_KERNEL(
-    c_reducescatter,
-    ops::CReduceScatterOpKernel<plat::CUDADeviceContext, float>,
-    ops::CReduceScatterOpKernel<plat::CUDADeviceContext, double>,
-    ops::CReduceScatterOpKernel<plat::CUDADeviceContext, int>,
-    ops::CReduceScatterOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::CReduceScatterOpKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(c_reducescatter, ops::CReduceScatterOpCUDAKernel<float>,
+                        ops::CReduceScatterOpCUDAKernel<double>,
+                        ops::CReduceScatterOpCUDAKernel<int>,
+                        ops::CReduceScatterOpCUDAKernel<int64_t>,
+                        ops::CReduceScatterOpCUDAKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_reducescatter_op.h
+++ b/paddle/fluid/operators/collective/c_reducescatter_op.h
@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
 #include <algorithm>
 #include <utility>
 #include <vector>
@ -22,52 +23,14 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"

-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
-class CReduceScatterOpKernel : public framework::OpKernel<T> {
+template <typename T>
+class CReduceScatterOpCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE(is_gpu_place(place),
-                   "CAllReduce op can run on gpu place only for now.");
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    auto in = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-
-    int rid = ctx.Attr<int>("ring_id");
-    auto comm = platform::NCCLCommContext::Instance().Get(rid);
-    int nranks = comm->nranks();
-
-    auto out_dims = in->dims();
-    out_dims[0] = out_dims[0] / nranks;
-    out->mutable_data<T>(out_dims, place);
-
-    int64_t recv_numel = in->numel() / nranks;
-    const T* send_buff = in->data<T>();
-    T* recv_buff = out->data<T>();
-    int dtype = platform::ToNCCLDataType(in->type());
-
-    cudaStream_t stream = nullptr;
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    PADDLE_ENFORCE(platform::dynload::ncclReduceScatter(
-        send_buff, recv_buff, recv_numel, static_cast<ncclDataType_t>(dtype),
-        ncclSum, comm->comm(), stream));
-#else
-    PADDLE_THROW("PaddlePaddle should compile with GPU.");
-#endif
+    PADDLE_THROW("Unimplemented cpu kernel for CReduceScatterOp.");
  }
 };

--- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
+++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
@ -15,12 +15,12 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include <nccl.h>
 #endif
-#include <stdint.h>
-#include <ostream>
+
 #include <string>

 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
+
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/collective_helper.h"
 #endif
@ -40,7 +40,6 @@ class CSyncCalcStreamOp : public framework::OperatorBase {
               const platform::Place& place) const override {
    PADDLE_ENFORCE(is_gpu_place(place),
                   "Sync stream op can run on gpu place only for now.");
-
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    auto dev_ctx = static_cast<platform::CUDADeviceContext*>(
        platform::DeviceContextPool::Instance().Get(place));
@ -57,12 +56,12 @@ class CSyncCalcStreamOp : public framework::OperatorBase {
 class CSyncCalcStreamOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
-    AddInput("X", "(Tensor) Dependency of last param need to sync");
-    AddOutput("Out", "(Tensor) Dependency of last param need to sync");
+    AddInput("X", "(Tensor) Dependency of the variable need to sync");
+    AddOutput("Out", "(Tensor) Dependency of the variable need to sync");
    AddComment(R"DOC(
-***Sync Operator***
+CSyncCalcStream Operator

-Call cuda stream synchronize.
+Call calculation stream synchronization.
 )DOC");
  }
 };
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include <nccl.h>
 #endif
-#include <stdint.h>
-#include <ostream>
+
 #include <string>

 #include "paddle/fluid/framework/lod_tensor.h"
@ -57,13 +57,13 @@ class CSyncCommStreamOp : public framework::OperatorBase {
 class CSyncCommStreamOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
-    AddInput("X", "(Tensor) Dependency of last param need to sync");
-    AddOutput("Out", "(Tensor) Dependency of last param need to sync");
+    AddInput("X", "(Tensor) Dependency of the variable need to sync");
+    AddOutput("Out", "(Tensor) Dependency of the variable need to sync");
    AddAttr<int>("ring_id", "(int default 0) ring id.").SetDefault(0);
    AddComment(R"DOC(
-***Sync Operator***
+CSyncCommStream Operator

-Call nccl stream synchronize.
+Call communication stream synchronization.
 )DOC");
  }
 };
--- a/Show More
+++ b/Show More