"add multioperator testcase"

fix-typo
Dong Zhihong 8 years ago
parent 94992a990b
commit 38d3adfeb6

@@ -100,8 +100,8 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
   }
 };
-// BcastSendOp
-class NCCLBcastSendOp : public framework::OperatorWithKernel {
+// BcastOp
+class NCCLBcastOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -111,20 +111,12 @@ class NCCLBcastSendOp : public framework::OperatorWithKernel {
                    " Input(X) of Bcast op input should not be NULL");
     PADDLE_ENFORCE(ctx->HasInput("Communicator"),
                    " Input(Communicator) of Bcast op input should not be NULL");
-  }
-};
-
-// BcastRecvOp
-class NCCLBcastRecvOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
-                   " Input(Communicator) of Bcast op input should not be NULL");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    " Output(Out) of Bcast op output should not be NULL");
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
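For readability, here is the merged NCCLBcastOp::InferShape as it reads once this hunk is applied — a reassembly of the context and added lines above, not an additional change:

    void InferShape(framework::InferShapeContext *ctx) const override {
      PADDLE_ENFORCE(ctx->HasInput("X"),
                     " Input(X) of Bcast op input should not be NULL");
      PADDLE_ENFORCE(ctx->HasInput("Communicator"),
                     " Input(Communicator) of Bcast op input should not be NULL");
      PADDLE_ENFORCE(ctx->HasOutput("Out"),
                     " Output(Out) of Bcast op output should not be NULL");
      auto x_dims = ctx->GetInputsDim("X");
      ctx->SetOutputsDim("Out", x_dims);  // each output mirrors its input's shape
      ctx->ShareLoD("X", /*->*/ "Out");   // forward the LoD unchanged
    }

Note that the SetOutputsDim/ShareLoD lines are new: the old NCCLBcastRecvOp only enforced that Out exists and never propagated shapes to it.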
@@ -146,52 +138,41 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
-// BcastSend should be in the root
-// BcastSendOp
-class NCCLBcastSendOpMaker : public framework::OpProtoAndCheckerMaker {
+// ReduceOp
+class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLBcastSendOpMaker(framework::OpProto *proto,
-                       framework::OpAttrChecker *op_checker)
+  NCCLReduceOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of BcastSend op");
+    AddInput("X", "The input of Reduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
-    AddAttr<int>("root", "root gpu of Bcast");
+    AddOutput("Out", "The output of Reduce op");
+    AddAttr<int>("root",
+                 "root gpu of the parameter. if not set(-1). hashed by name.")
+        .SetDefault(-1);
     AddComment(R"DOC(
-            Bcast the tensors.
-        )DOC");
+            Reduce the tensors)DOC");
   }
 };
 // BcastOp
-class NCCLBcastRecvOpMaker : public framework::OpProtoAndCheckerMaker {
+class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLBcastRecvOpMaker(framework::OpProto *proto,
-                       framework::OpAttrChecker *op_checker)
+  NCCLBcastOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of BcastSend op");
     AddInput("Communicator", "Communicator for communicating between gpus");
-    AddAttr<int>("root", "root gpu of BcastRecv");
+    AddOutput("Out", "The output of Bcast");
+    AddAttr<int>("root",
+                 "root gpu of the parameter. if not set(-1). hashed by name.")
+        .SetDefault(-1);
     AddComment(R"DOC(
             Bcast the tensors.
         )DOC");
   }
 };
-// BcastRecvOp
-class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  NCCLReduceOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of Reduce op");
-    AddInput("Communicator", "Communicator for communicating between gpus");
-    AddOutput("Out", "The output of Reduce op");
-    AddComment(R"DOC(
-            Reduce the tensors.
-        )DOC");
-  }
-};
 } // namespace operators
 } // namespace paddle
@@ -201,9 +182,7 @@ REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp,
 REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp,
                              ops::NCCLAllReduceOpMaker);
-REGISTER_OP_WITHOUT_GRADIENT(ncclBcastSend, ops::NCCLBcastSendOp,
-                             ops::NCCLBcastSendOpMaker);
-REGISTER_OP_WITHOUT_GRADIENT(ncclBcastRecv, ops::NCCLBcastRecvOp,
-                             ops::NCCLBcastRecvOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, ops::NCCLBcastOp,
+                             ops::NCCLBcastOpMaker);
 REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp,
                              ops::NCCLReduceOpMaker);

@@ -83,6 +83,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     auto ins = ctx.MultiInput<LoDTensor>("X");  // x0, x1, x2
     auto outs = ctx.MultiOutput<LoDTensor>("Out");
+    int root = ctx.Attr<int>("root");
     auto* comm = ctx.Input<Communicator>("Communicator");
@@ -97,7 +98,9 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     auto ins_names = ctx.Inputs("X");
     std::hash<std::string> hasher;
     for (size_t i = 0; i < ins.size(); ++i) {
-      int root = hasher(ins_names[i]) % comm->comms_.size();
+      if (root == -1) {
+        root = hasher(ins_names[i]) % comm->comms_.size();
+      }
       T* recvbuffer = nullptr;
       if (root == device_id) {
         recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
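The new root == -1 branch is a deterministic fallback: when the attribute is left at its default, each input's root GPU is derived by hashing the input's name modulo the number of communicators. A minimal standalone sketch of that rule (the SelectRoot helper and parameter names are hypothetical, not Paddle code):

    #include <functional>
    #include <iostream>
    #include <string>

    // Mirrors the kernel logic above: an explicit root wins; otherwise the
    // parameter name is hashed modulo the communicator size.
    int SelectRoot(int root_attr, const std::string& param_name, size_t num_gpus) {
      if (root_attr != -1) return root_attr;
      std::hash<std::string> hasher;
      return static_cast<int>(hasher(param_name) % num_gpus);
    }

    int main() {
      const size_t num_gpus = 4;  // assumed communicator size
      for (const std::string name : {"fc_0.w", "fc_0.b", "fc_1.w"}) {
        std::cout << name << " -> root " << SelectRoot(-1, name, num_gpus) << "\n";
      }
      return 0;
    }

std::hash is only guaranteed deterministic within a single binary, but since every trainer runs the same binary, all ranks compute the same root for a given parameter without any extra communication — exactly what hasher(ins_names[i]) % comm->comms_.size() relies on above.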
@@ -135,8 +138,9 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
     int device_id =
         boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
     int idx = comm->GetCommId(device_id);
     if (idx == root) {
-      auto ins = ctx.MultiInput<Tensor>("X");
+      auto ins = ctx.MultiInput<LoDTensor>("X");
       for (size_t i = 0; i < ins.size(); ++i) {
         PADDLE_ENFORCE(platform::dynload::ncclBcast(
             (void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
@@ -144,7 +148,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
         PADDLE_ENFORCE(cudaStreamSynchronize(stream));
       }
     } else {
-      auto outs = ctx.MultiOutput<Tensor>("Out");
+      auto outs = ctx.MultiOutput<LoDTensor>("Out");
       for (size_t i = 0; i < outs.size(); ++i) {
         PADDLE_ENFORCE(platform::dynload::ncclBcast(
             outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(),
@@ -160,6 +164,5 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
-REGISTER_OP_GPU_KERNEL(ncclBcastSend, ops::NCCLBcastKernel<float>);
+REGISTER_OP_GPU_KERNEL(ncclBcast, ops::NCCLBcastKernel<float>);
 REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
-REGISTER_OP_GPU_KERNEL(ncclBcastRecv, ops::NCCLBcastKernel<float>);
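The send/recv pair collapses into one ncclBcast op (and one kernel) because an NCCL broadcast is a single collective: every rank issues the identical call, and the root argument alone decides who sends and who receives. A minimal sketch against the NCCL library directly — not Paddle code; it assumes one process driving all visible GPUs, NCCL 2+, and omits error checking:

    #include <cuda_runtime.h>
    #include <nccl.h>
    #include <vector>

    int main() {
      int n = 0;
      cudaGetDeviceCount(&n);

      // One communicator per visible GPU, all within a single process.
      std::vector<int> devs(n);
      for (int i = 0; i < n; ++i) devs[i] = i;
      std::vector<ncclComm_t> comms(n);
      ncclCommInitAll(comms.data(), n, devs.data());

      const size_t count = 1024;
      const int root = 0;  // the only thing distinguishing sender from receivers
      std::vector<float*> bufs(n);
      std::vector<cudaStream_t> streams(n);
      for (int i = 0; i < n; ++i) {
        cudaSetDevice(i);
        cudaMalloc(&bufs[i], count * sizeof(float));
        cudaStreamCreate(&streams[i]);
      }

      // The identical call on every device: rank `root` sends its buffer,
      // every other rank overwrites its buffer with the received data.
      ncclGroupStart();  // needed in NCCL 2+ when one thread drives many GPUs
      for (int i = 0; i < n; ++i) {
        ncclBcast(bufs[i], count, ncclFloat, root, comms[i], streams[i]);
      }
      ncclGroupEnd();

      for (int i = 0; i < n; ++i) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
        cudaFree(bufs[i]);
        cudaStreamDestroy(streams[i]);
        ncclCommDestroy(comms[i]);
      }
      return 0;
    }

On the Paddle side this is exactly the idx == root branch in NCCLBcastKernel above: the root reads X, every other device writes Out.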

File diff suppressed because it is too large