Repair nccl op test (#8575)
* fix nccl op unit test
* fix build error
* format code
* refine nccl related unit test
* fix build error
* add setGPUData
* clean up
* follow comments
* rm test_nccl.cu
* follow comment
* rm wait
parent ada82a3e24
commit 7287630e83
@@ -1,153 +0,0 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <thrust/device_vector.h>
#include <memory>
#include <vector>

#include "glog/logging.h"
#include "gtest/gtest.h"

#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"

static int dev_count = 0;

namespace paddle {
namespace platform {
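// Smoke test: create one NCCL communicator per visible GPU via the
// dynamically loaded ncclCommInitAll, then destroy them.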
TEST(NCCL, init) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);
  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));

  for (int i = 0; i < dev_count; ++i) {
    dynload::ncclCommDestroy(comms[i]);
  }
}
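// Per-GPU state for the all_reduce test: device send/receive buffers plus a
// CUDADeviceContext (and its stream) bound to that GPU. The constructor fills
// send_buff with the ramp 0, 1, ..., size-1.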
template <typename T>
struct PerThreadData {
  thrust::device_vector<T> send_buff;
  thrust::device_vector<T> recv_buff;
  CUDADeviceContext dev_ctx;

  T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); }

  T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }

  PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) {
    send_buff.resize(size);
    for (size_t i = 0; i < size; ++i) {
      send_buff[i] = static_cast<T>(i);
    }
    recv_buff.resize(size);
  }
};

static constexpr int ELEM_COUNT = 10000;
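// All-reduce test: every GPU contributes the same ramp 0..ELEM_COUNT-1, so
// after a sum all-reduce element j on every device should equal j * dev_count.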
TEST(NCCL, all_reduce) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);
  VLOG(1) << "Initializing ncclComm";
  dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
  VLOG(1) << "ncclComm initialized";
  VLOG(1) << "Creating thread data";
  std::vector<std::unique_ptr<PerThreadData<double>>> data;
  data.reserve(dev_count);
  for (int i = 0; i < dev_count; ++i) {
    VLOG(1) << "Creating thread data for device " << i;
    SetDeviceId(i);
    data.emplace_back(new PerThreadData<double>(i, ELEM_COUNT));
  }
  VLOG(1) << "Thread data created";

  VLOG(1) << "Check send_buf data";
  for (int i = 0; i < dev_count; ++i) {
    VLOG(1) << "Check on device " << i;
    SetDeviceId(i);
    thrust::host_vector<double> tmp = data[i]->send_buff;
    for (size_t j = 0; j < tmp.size(); ++j) {
      ASSERT_NEAR(static_cast<double>(j), tmp[j], 1e-5);
    }
  }

  VLOG(1) << "Invoking ncclAllReduce";

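  // One host thread drives all devices, so the per-device ncclAllReduce calls
  // are wrapped in ncclGroupStart()/ncclGroupEnd() to keep NCCL from blocking
  // until every rank has been issued.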
  dynload::ncclGroupStart();
  for (int i = 0; i < dev_count; ++i) {
    VLOG(1) << "Invoking ncclAllReduce with device " << i;
    SetDeviceId(i);
    PADDLE_ENFORCE(dynload::ncclAllReduce(
        data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble,
        ncclSum, comms[i], data[i]->dev_ctx.stream()));
    VLOG(1) << "Invoked ncclAllReduce for device " << i;
  }
  dynload::ncclGroupEnd();

  VLOG(1) << "Invoked ncclAllReduce";

  VLOG(1) << "Sync devices";
  for (int i = 0; i < dev_count; ++i) {
    VLOG(1) << "Sync device " << i;
    SetDeviceId(i);
    data[i]->dev_ctx.Wait();
  }
  VLOG(1) << "device synced";
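  // Verify the reduction: element j was contributed once per device, so the
  // expected value on every device is j * dev_count.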
  for (int i = 0; i < dev_count; ++i) {
    SetDeviceId(i);
    VLOG(1) << "Checking vector on device " << i;
    thrust::host_vector<double> tmp = data[i]->recv_buff;
    for (size_t j = 0; j < tmp.size(); ++j) {
      auto elem = static_cast<double>(j);
      elem *= dev_count;
      ASSERT_NEAR(tmp[j], elem, 1e-4);
    }
  }

  for (int i = 0; i < dev_count; ++i) {
    dynload::ncclCommDestroy(comms[i]);
  }
}
}  // namespace platform
}  // namespace paddle
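// Skip the whole binary (return 0) when fewer than two GPUs are visible;
// otherwise initialize the DeviceContextPool with the CPU place and every
// CUDA place before running the tests.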
int main(int argc, char** argv) {
  dev_count = paddle::platform::GetCUDADeviceCount();
  if (dev_count <= 1) {
    LOG(WARNING)
        << "Cannot test multi-gpu nccl, because the CUDA device count is "
        << dev_count;
    return 0;
  }

  std::vector<paddle::platform::Place> places;

  places.emplace_back(paddle::platform::CPUPlace());
  int count = paddle::platform::GetCUDADeviceCount();
  for (int i = 0; i < count; ++i) {
    places.emplace_back(paddle::platform::CUDAPlace(i));
  }

  VLOG(0) << " DeviceCount " << count;
  paddle::platform::DeviceContextPool::Init(places);

  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}