checkin nccl operator

8 years ago · 0990c87bf6
parent da1181bfc6
commit 0990c87bf6
5 changed files with 42 additions and 89 deletions
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@ -46,7 +46,8 @@ struct Communicator {

  ~Communicator() {
    for (size_t i = 0; i < comms_.size(); ++i) {
-      PADDLE_ENFORCE(dynload::ncclCommDestroy(comms_[i]));
+      // FIXME(dzh): PADDLE_ENFORCE was dropped here (original note: "PADDLE_ENFORCE return void"), so the ncclCommDestroy result is now ignored.
+      dynload::ncclCommDestroy(comms_[i]);
    }
  }

--- a/paddle/operators/nccl_op_test.cc
+++ b/paddle/operators/nccl_op_test.cc
@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-#include "paddle/operators/nccl_op.h"
-
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-
-#include "paddle/platform/device_context.h"
-#include "paddle/platform/enforce.h"
-#include "paddle/platform/gpu_info.h"
-
-#include <thrust/device_vector.h>
-#include <memory>
-#include <vector>
-
-static std::vector<int> gpu_list;
-
-using f = paddle::framework;
-using ops = paddle::operators;
-
-void AddOp(const std::string &type, const f::VariableNameMap &inputs,
-           const f::VariableNameMap &outputs, f::AttributeMap attrs,
-           paddle::framework::BlockDescBind *block) {
-  for (auto kv : outputs) {
-    for (auto v : kv.second) {
-      auto var = block->Var(v);
-      var->SetDataType(paddle::framework::DataType::FP32);
-    }
-  }
-
-  auto op = block->AppendOp();
-  op->SetType(type);
-  for (auto &kv : inputs) {
-    op->SetInput(kv.first, kv.second);
-  }
-  for (auto &kv : outputs) {
-    op->SetOutput(kv.first, kv.second);
-  }
-  op->SetAttrMap(attrs);
-}
-
-TEST(NCCL, ncclInitOp) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.Block(0);
-}
-
-int main(int argc, char **argv) {
-  static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < gpu_count; ++i) {
-    gpu_list.emplace_back(i);
-  }
-  if (dev_count <= 1) {
-    LOG(WARNING)
-        << "Cannot test multi-gpu nccl, because the CUDA device count is "
-        << dev_count;
-    return 0;
-  }
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu
@ -16,6 +16,11 @@
 #include "glog/logging.h"
 #include "gtest/gtest.h"

+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/var_desc.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/gpu_info.h"
@ -26,8 +31,8 @@

 static std::vector<int> gpu_list;

-using f = paddle::framework;
-using ops = paddle::operators;
+namespace f = paddle::framework;
+namespace ops = paddle::operators;

 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
           const f::VariableNameMap &outputs, f::AttributeMap attrs,
@ -50,22 +55,40 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
  op->SetAttrMap(attrs);
 }

-TEST(NCCL, ncclInitOp) {
+// Appends a single "ncclInit" op to block 0 of a fresh program via AddOp.
+// The op's "Communicator" output slot is bound to a variable named "comm" and
+// its "gpus" attribute carries the device ids that main() stores in gpu_list.
+// The test makes no assertions yet; it only exercises building the op
+// description.
+TEST(NCCL, ncclInit) {
  f::ProgramDescBind program;
  f::BlockDescBind *block = program.Block(0);
+
+  // Output slots map to lists of variable *names* (AddOp hands each entry to
+  // block->Var), so a name string is passed rather than a Communicator
+  // object.
+  // NOTE(review): assumes f::AttributeMap accepts a std::vector<int> value
+  // for "gpus" -- confirm against the attribute type definition.
+  AddOp("ncclInit", {}, {{"Communicator", {"comm"}}}, {{"gpus", {gpu_list}}},
+        block);
 }

+// TEST(NCCL, ncclAllReduce) {
+//   f::ProgramDescBind program;
+//   f::BlockDescBind *block = program.Block(0);
+
+//   paddle::platform::Communicator comm;
+//   AddOp("ncclInit", {}, {{"Communicator", {comm}}, {"gpus", {gpu_list}}},
+//   block);
+// }
+
 int main(int argc, char **argv) {
-  static constexpr int gpu_count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < gpu_count; ++i) {
-    gpu_list.emplace_back(i);
-  }
+  static int dev_count = paddle::platform::GetCUDADeviceCount();  // queried once; a count <= 1 makes main return 0 before any test runs
  if (dev_count <= 1) {
    LOG(WARNING)
        << "Cannot test multi-gpu nccl, because the CUDA device count is "
        << dev_count;
    return 0;
  }
+
+  for (int i = 0; i < dev_count; ++i) {  // fill the file-level gpu_list with indices 0..dev_count-1 before the tests start
+    gpu_list.emplace_back(i);
+  }
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
 }
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@ -31,9 +31,7 @@ namespace platform {
 TEST(NCCL, init) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);
-
-  auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
-  PADDLE_ENFORCE(status);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
  for (int i = 0; i < dev_count; ++i) {
    dynload::ncclCommDestroy(comms[i]);
  }
@ -64,8 +62,7 @@ TEST(NCCL, all_reduce) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);
  VLOG(1) << "Initializing ncclComm";
-  auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
-  PADDLE_ENFORCE(status);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
  VLOG(1) << "ncclComm initialized";
  VLOG(1) << "Creating thread data";
  std::vector<std::unique_ptr<PerThreadData<double>>> data;
--- a/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py
+++ b/python/paddle/v2/framework/tests/test_nccl_allreduce_op.py
@ -53,6 +53,9 @@ def thread_allreduce_op(thread_id, gpu_id):
    op = create_op(scope, "ncclAllReduce", inputs, outputs, attrs={})
    place = core.GPUPlace(gpus[i])
    set_input(scope, op, inputs, place)
+    # # print scope.find_var("Out").get_tensor()
+    # # print scope.find_var("X").get_tensor()
+    print scope.find_var("Communicator").get_communicator()

    ctx = core.DeviceContext.create(place)

@ -83,13 +86,13 @@ class TestNCCLAllReduce(unittest.TestCase):
                    i,
                    gpus[i], ))
            th.start()
-            ops.append(ops)
-        for th in ops:
-            th.join()
+            ops.append(th)
+        for t in ops:
+            t.join()

        idx = 0
-        for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
-            actual = np.array(scope.find_var(out_name).get_tensor())
+        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+            actual = np.array(g_scope.find_var(out_name).get_tensor())
            expect = output_data[idx]

            idx += 1