parent fdfc8f9baa
commit 333045d7b2
@@ -1,61 +1,17 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/operators/nccl/nccl_gpu_common.h"
 #include "paddle/platform/gpu_info.h"

 namespace paddle {
-namespace platform {
-
-NCCLManager::NCCLManager() {}
-
-NCCLManager::~NCCLManager() {
-  for (auto& p : comm_table) {
-    auto& comm = p.second;
-    auto& gpus_ = comm->gpus_;
-    for (size_t i = 0; i < gpus_.size(); ++i) {
-      int gid = gpus_[i];
-      platform::SetDeviceId(gid);
-
-      // mapping gid to idx
-      int idx = gid % gpus_.size();
-      // wait finish
-      PADDLE_ENFORCE(
-          cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0));
-      PADDLE_ENFORCE(cudaEventDestroy(comm->events_[idx]));
-      PADDLE_ENFORCE(ncclCommDestroy(comm->comms_[idx]));
-    }
-    comm.reset(nullptr);
-  }
-}
-
-Communicator* NCCLManager::GetCommunicator(const std::vector<int>& gpus) {
-  std::string key;
-  for (auto& id : gpus) {
-    key += std::to_string(id);
-  }
-  std::sort(key.begin(), key.end());
-
-  std::mutex mu;
-  std::lock_guard<std::mutex> lk(mu);
-
-  auto it = comm_table.find(key);
-
-  if (it->second == nullptr) {
-    auto* comm = new Communicator(gpus);
-    PADDLE_ENFORCE(
-        ncclCommInitAll(comm->comms_.data(), gpus.size(), gpus.data()));
-
-    for (size_t i = 0; i < gpus.size(); ++i) {
-      platform::SetDeviceId(gpus[i]);
-
-      // block wait
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(
-          &comm->events_[i], cudaEventBlockingSync | cudaEventDisableTiming));
-    }
-    comm_table[key].reset(comm);
-  }
-  return comm_table[key].get();
-}
-
-}  // namespace operators
+namespace platform {}  // namespace platform
 }  // namespace paddle
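The hunk above empties nccl_gpu_common.cu of the old NCCLManager and its per-GPU event and stream bookkeeping. The communicator state moves behind a Communicator type that the new kernels later in this diff expect to provide InitAll, GetCommId, and a comms_ array. Its real declaration lives in nccl_gpu_common.h, which this diff does not show; the sketch below is only the shape those call sites imply, not the code this commit ships.

    // Hypothetical sketch, inferred from the call sites in the new
    // nccl_op.h/.cu hunks below; not the actual nccl_gpu_common.h.
    #include <unordered_map>
    #include <vector>

    #include <nccl.h>

    struct Communicator {
      std::vector<ncclComm_t> comms_;
      std::unordered_map<int, int> comm_id_map_;  // device id -> index into comms_

      // Create one NCCL communicator per device in a single collective call.
      void InitAll(const std::vector<int>& gpus) {
        comms_.resize(gpus.size());
        for (size_t i = 0; i < gpus.size(); ++i) {
          comm_id_map_[gpus[i]] = static_cast<int>(i);
        }
        ncclCommInitAll(comms_.data(), static_cast<int>(gpus.size()), gpus.data());
      }

      // Map a CUDA device id to the communicator index used by the kernels.
      int GetCommId(int device_id) const { return comm_id_map_.at(device_id); }
    };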
@@ -1,16 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#define EIGEN_USE_GPU
-#include "paddle/operators/nccl/nccl_ops.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
@@ -1,103 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/nccl/nccl_gpu_common.h"
-
-#include <string.h>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-template <typename Type>
-class NCCLTypeWrapper;
-
-template <>
-class NCCLTypeWrapper<float> {
- public:
-  static const ncclDataType_t type = ncclFloat;
-};
-
-template <>
-class NCCLTypeWrapper<double> {
- public:
-  static const ncclDataType_t type = ncclDouble;
-};
-
-class NCCLInitOp : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto gpus = ctx.Input<std::vector<int>>("gpus");
-    auto* comm = ctx.Output<Communicator>("Communicator");
-    comm->mutable_data<Communicator>(CPUPlace());
-    comm = NCCLManager::GetCommunicator(gpus);
-  }
-};
-
-template <typename T>
-class NCCLAllReduceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto ins = ctx.MultiInput<Tensor>("X");
-    auto outs = ctx.MultiOutput<Tensor>("Out");
-    std::string reduction = ctx.Attr<std::string>("reduction");
-    std::vector<int> gpus = ctx.Attr<std::vector<int>>("gpus");
-    ncclRedOp_t op_type;
-    if (reduction == "ncclSum") {
-      op_type = ncclSum;
-    } else if (reduction == "ncclProd") {
-      op_type = ncclProd;
-    } else if (reduction == "ncclMin") {
-      op_type = ncclMin;
-    } else if (reduction == "ncclMax") {
-      op_type = ncclMax;
-    }
-
-    auto* comm = ctx.Input<Communicator>("Communicator");
-
-    auto dev_ctx =
-        static_cast<const platform::CUDADeviceContext>(ctx.device_context());
-
-    // platform::NCCLManager* m = platform::NCCLManager::Get();
-    // auto* comm = m->GetCommunicator(gpus);
-    // comm->wg_.Add(1);
-
-    auto stream = dev_ctx.stream();
-
-    // device id
-    int gid = static_cast<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
-    int idx = gid % gpus.size();
-    comm->streams_[idx] = stream;
-
-    for (size_t i = 0; i < ins.size(); ++i) {
-      PADDLE_ENFORCE(
-          ncclAllReduce(ins[i]->data<T>(), outs[i]->mutable_data<T>(),
-                        outs[i]->numel() * sizeof(T), NCCLTypeWrapper<T>::type,
-                        op_type, comm->comms_[idx], comm->streams_[idx]));
-      PADDLE_ENFORCE(cudaEventRecord(comm->events_[idx], comm->streams_[idx]));
-
-      // // wait finish
-      // PADDLE_ENFORCE(
-      //     cudaStreamWaitEvent(comm->streams_[idx], comm->events_[idx], 0));
-    }
-
-    // comm->wg_.Done();
-    // comm->wg_.Wait();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
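One detail worth noting in the deleted header: it mapped a device id to a communicator slot with `int idx = gid % gpus.size()`, which collides whenever the configured device ids are not contiguous from zero. The short check below illustrates the failure and the explicit-map fix that the replacement's GetCommId presumably implements; the map here is illustrative, not Paddle code.

    #include <cassert>
    #include <map>
    #include <vector>

    int main() {
      std::vector<int> gpus = {1, 3};  // non-contiguous device ids
      // Old scheme: gid % gpus.size() sends both devices to slot 1.
      assert(1 % gpus.size() == 1 && 3 % gpus.size() == 1);
      // Explicit map: each device gets its own slot.
      std::map<int, int> comm_id;
      for (size_t i = 0; i < gpus.size(); ++i) comm_id[gpus[i]] = i;
      assert(comm_id[1] == 0 && comm_id[3] == 1);
      return 0;
    }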
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/nccl_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class NCCLAllReduceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto outs = ctx.MultiOutput<Tensor>("Out");
+    std::string reduction = ctx.Attr<std::string>("reduction");
+    ncclRedOp_t op_type;
+    if (reduction == "ncclSum") {
+      op_type = ncclSum;
+    } else if (reduction == "ncclProd") {
+      op_type = ncclProd;
+    } else if (reduction == "ncclMin") {
+      op_type = ncclMin;
+    } else if (reduction == "ncclMax") {
+      op_type = ncclMax;
+    } else {
+      PADDLE_ENFORCE(false, "reduction error.");
+    }
+
+    auto* comm = ctx.Input<Communicator>("Communicator");
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+
+    // device id
+    int device_id =
+        boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(device_id);
+
+    for (size_t i = 0; i < ins.size(); ++i) {
+      PADDLE_ENFORCE(ncclAllReduce(
+          ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
+          outs[i]->numel() * sizeof(T), NCCLTypeWrapper<T>::type, op_type,
+          comm->comms_[idx], stream));
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
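The new nccl_op.cu issues one ncclAllReduce per input on the device context's stream and then blocks on cudaStreamSynchronize. Outside the framework, the same single-process multi-GPU pattern looks roughly as follows. This is the public NCCL/CUDA API, but the device list, buffer size, and the ncclGroupStart/ncclGroupEnd bracketing (which NCCL 2 recommends when one thread drives several devices) are choices of this sketch, not part of the commit.

    // Minimal single-process, multi-GPU all-reduce sketch (public NCCL/CUDA API).
    #include <vector>

    #include <cuda_runtime.h>
    #include <nccl.h>

    int main() {
      std::vector<int> devs = {0, 1};
      const int ndev = static_cast<int>(devs.size());
      const size_t count = 1024;  // ncclAllReduce takes an element count, not bytes

      std::vector<ncclComm_t> comms(ndev);
      ncclCommInitAll(comms.data(), ndev, devs.data());

      std::vector<float*> send(ndev), recv(ndev);
      std::vector<cudaStream_t> streams(ndev);
      for (int i = 0; i < ndev; ++i) {
        cudaSetDevice(devs[i]);
        cudaMalloc(&send[i], count * sizeof(float));
        cudaMalloc(&recv[i], count * sizeof(float));
        cudaStreamCreate(&streams[i]);
      }

      // Launch one collective per device; grouping the calls avoids
      // inter-device deadlock when a single thread issues all of them.
      ncclGroupStart();
      for (int i = 0; i < ndev; ++i) {
        ncclAllReduce(send[i], recv[i], count, ncclFloat, ncclSum, comms[i],
                      streams[i]);
      }
      ncclGroupEnd();

      // Block until every device has finished, mirroring the kernel's
      // cudaStreamSynchronize call above.
      for (int i = 0; i < ndev; ++i) {
        cudaSetDevice(devs[i]);
        cudaStreamSynchronize(streams[i]);
      }

      for (int i = 0; i < ndev; ++i) {
        cudaSetDevice(devs[i]);
        cudaFree(send[i]);
        cudaFree(recv[i]);
        cudaStreamDestroy(streams[i]);
        ncclCommDestroy(comms[i]);
      }
      return 0;
    }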
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+
+#include <string.h>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using platform::Communicator;
+
+template <typename Type>
+class NCCLTypeWrapper;
+
+template <>
+class NCCLTypeWrapper<float> {
+ public:
+  static const ncclDataType_t type = ncclFloat;
+};
+
+template <>
+class NCCLTypeWrapper<double> {
+ public:
+  static const ncclDataType_t type = ncclDouble;
+};
+
+template <typename T>
+class NCCLInitKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* gpus = ctx.Input<std::vector<int>>("gpus");
+    auto* comm = ctx.Output<Communicator>("Communicator");
+    comm->InitAll(*gpus);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
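NCCLTypeWrapper in the new header is a small type trait: it resolves the ncclDataType_t tag for an element type at compile time, which is what keeps NCCLAllReduceKernel<T> free of runtime type branching. A self-contained illustration of the pattern follows; the trait is restated so the snippet compiles on its own, and AllReduceSum is a hypothetical helper, not part of the commit.

    // Compile-time mapping from a C++ element type to NCCL's runtime type tag.
    #include <nccl.h>

    template <typename Type>
    class NCCLTypeWrapper;

    template <>
    class NCCLTypeWrapper<float> {
     public:
      static const ncclDataType_t type = ncclFloat;
    };

    template <>
    class NCCLTypeWrapper<double> {
     public:
      static const ncclDataType_t type = ncclDouble;
    };

    // Any templated collective can now pick the tag without branching:
    template <typename T>
    ncclResult_t AllReduceSum(const T* in, T* out, size_t count,
                              ncclComm_t comm, cudaStream_t stream) {
      return ncclAllReduce(in, out, count, NCCLTypeWrapper<T>::type, ncclSum,
                           comm, stream);
    }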