Feature/nccl dso (#5001)

* "add nccl enforce"
* Dev
* Update comment
* Add nccl test
* Follow comments

parent fcd74e06b8
commit 43c6ff212e

@@ -0,0 +1,30 @@
if (NOT WITH_GPU)
  return ()
endif()

set(NCCL_ROOT "/usr" CACHE PATH "NCCL ROOT")
find_path(NCCL_INCLUDE_DIR nccl.h PATHS
        ${NCCL_ROOT} ${NCCL_ROOT}/include
        $ENV{NCCL_ROOT} $ENV{NCCL_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
        NO_DEFAULT_PATH)

get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)

set(TARGET_ARCH "x86_64")
if(CMAKE_SYSTEM_PROCESSOR)
  set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif()

list(APPEND NCCL_CHECK_LIBRARY_DIRS
        ${NCCL_ROOT}
        ${NCCL_ROOT}/lib64
        ${NCCL_ROOT}/lib
        ${NCCL_ROOT}/lib/${TARGET_ARCH}-linux-gnu
        $ENV{NCCL_ROOT}
        $ENV{NCCL_ROOT}/lib64
        $ENV{NCCL_ROOT}/lib
        /usr/lib)
find_library(NCCL_LIBRARY NAMES libnccl.so libnccl.dylib # libnccl_static.a
        PATHS ${NCCL_CHECK_LIBRARY_DIRS} ${NCCL_INCLUDE_DIR} ${__libpath_hist}
        NO_DEFAULT_PATH
        DOC "Path to nccl library.")

@@ -1,2 +1,2 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
-nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc DEPS dynamic_loader)
+nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc DEPS dynamic_loader)

@@ -0,0 +1,30 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#include "paddle/platform/dynload/nccl.h"

namespace paddle {
namespace platform {
namespace dynload {

std::once_flag nccl_dso_flag;
void *nccl_dso_handle;

#define DEFINE_WRAP(__name) DynLoad__##__name __name

NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);

}  // namespace dynload
}  // namespace platform
}  // namespace paddle

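The DEFINE_WRAP macro above only provides storage: for each routine listed in NCCL_RAND_ROUTINE_EACH it defines the global functor object that nccl.h declares as extern. A minimal, self-contained sketch of that expansion, using a hypothetical ncclDemo symbol (not a real NCCL routine, not Paddle code):

// Standalone illustration of DEFINE_WRAP with a made-up ncclDemo symbol.
#include <iostream>

struct DynLoad__ncclDemo {                 // stands in for a DynLoad__<name> functor
  int operator()(int x) { return x + 1; }  // the real functors forward to NCCL
};

#define DEFINE_WRAP(__name) DynLoad__##__name __name
DEFINE_WRAP(ncclDemo);  // expands to: DynLoad__ncclDemo ncclDemo;

int main() { std::cout << ncclDemo(41) << "\n"; }  // prints 42
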
@@ -0,0 +1,72 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#pragma once

#include <dlfcn.h>
#include <nccl.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"

namespace paddle {
namespace platform {
namespace dynload {

extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;

#ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                    \
  struct DynLoad__##__name {                                      \
    template <typename... Args>                                   \
    auto operator()(Args... args) -> decltype(__name(args...)) {  \
      using nccl_func = decltype(__name(args...)) (*)(Args...);   \
      std::call_once(nccl_dso_flag,                               \
                     paddle::platform::dynload::GetNCCLDsoHandle, \
                     &nccl_dso_handle);                           \
      void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
      return reinterpret_cast<nccl_func>(p_##__name)(args...);    \
    }                                                             \
  };                                                              \
  extern DynLoad__##__name __name
#else
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
  struct DynLoad__##__name {                   \
    template <typename... Args>                \
    ncclResult_t operator()(Args... args) {    \
      return __name(args...);                  \
    }                                          \
  };                                           \
  extern DynLoad__##__name __name
#endif

#define NCCL_RAND_ROUTINE_EACH(__macro) \
  __macro(ncclCommInitAll);             \
  __macro(ncclGetUniqueId);             \
  __macro(ncclCommInitRank);            \
  __macro(ncclCommDestroy);             \
  __macro(ncclCommCount);               \
  __macro(ncclCommCuDevice);            \
  __macro(ncclCommUserRank);            \
  __macro(ncclAllReduce);               \
  __macro(ncclBcast);                   \
  __macro(ncclAllGather);               \
  __macro(ncclReduce);                  \
  __macro(ncclGetErrorString);

NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)

}  // namespace dynload
}  // namespace platform
}  // namespace paddle

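Under PADDLE_USE_DSO each wrapper resolves its NCCL symbol from the shared library at first use (via std::call_once and dlsym) instead of linking against NCCL at build time. A simplified, self-contained analogue of that pattern, assuming a Linux libm.so.6 and the cos symbol purely for illustration (this is not Paddle's API and it skips error handling):

// Sketch of the once-loaded dlopen/dlsym wrapper pattern used by
// DECLARE_DYNAMIC_LOAD_NCCL_WRAP, reduced to a single symbol.
#include <dlfcn.h>   // dlopen, dlsym
#include <iostream>
#include <mutex>

static std::once_flag dso_flag;
static void* dso_handle = nullptr;

struct DynLoad__cos {
  template <typename... Args>
  double operator()(Args... args) {
    using func_t = double (*)(double);
    // Open the shared library exactly once, no matter how many calls happen.
    std::call_once(dso_flag, [] { dso_handle = dlopen("libm.so.6", RTLD_LAZY); });
    void* p = dlsym(dso_handle, "cos");  // resolve the symbol by name
    return reinterpret_cast<func_t>(p)(args...);
  }
};

int main() {
  DynLoad__cos cos_;                // analogous to the extern DynLoad__<name> objects
  std::cout << cos_(0.0) << "\n";   // prints 1 (link with -ldl on older glibc)
}
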
@@ -0,0 +1,139 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/dynload/nccl.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/gpu_info.h"

#include <thrust/device_vector.h>
#include <memory>
#include <vector>

static int dev_count = 0;

namespace paddle {
namespace platform {

TEST(NCCL, init) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);

  auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
  PADDLE_ENFORCE(status);
  for (int i = 0; i < dev_count; ++i) {
    dynload::ncclCommDestroy(comms[i]);
  }
}

template <typename T>
struct PerThreadData {
  thrust::device_vector<T> send_buff;
  thrust::device_vector<T> recv_buff;
  CUDADeviceContext dev_ctx;

  T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); }

  T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }

  PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) {
    send_buff.resize(size);
    for (size_t i = 0; i < size; ++i) {
      send_buff[i] = static_cast<T>(i);
    }
    recv_buff.resize(size);
  }
};

static constexpr int ELEM_COUNT = 10000;

TEST(NCCL, all_reduce) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);
  VLOG(1) << "Initializing ncclComm";
  auto status = dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
  PADDLE_ENFORCE(status);
  VLOG(1) << "ncclComm initialized";
  VLOG(1) << "Creating thread data";
  std::vector<std::unique_ptr<PerThreadData<double>>> data;
  data.reserve(dev_count);
  for (int i = 0; i < dev_count; ++i) {
    VLOG(1) << "Creating thread data for device " << i;
    SetDeviceId(i);
    data.emplace_back(new PerThreadData<double>(i, ELEM_COUNT));
  }
  VLOG(1) << "Thread data created";

  VLOG(1) << "Check send_buf data";
  for (int i = 0; i < dev_count; ++i) {
    VLOG(1) << "Check on device " << i;
    SetDeviceId(i);
    thrust::host_vector<double> tmp = data[i]->send_buff;
    for (size_t j = 0; j < tmp.size(); ++j) {
      ASSERT_NEAR(static_cast<double>(j), tmp[j], 1e-5);
    }
  }

  VLOG(1) << "Invoking ncclAllReduce";

  for (int i = 0; i < dev_count; ++i) {
    VLOG(1) << "Invoking ncclAllReduce with device " << i;
    SetDeviceId(i);
    PADDLE_ENFORCE(dynload::ncclAllReduce(
        data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble,
        ncclSum, comms[i], data[i]->dev_ctx.stream()));
    VLOG(1) << "Invoked ncclAllReduce for device " << i;
  }

  VLOG(1) << "Invoked ncclAllReduce";

  VLOG(1) << "Sync devices";
  for (int i = 0; i < dev_count; ++i) {
    VLOG(1) << "Sync device " << i;
    SetDeviceId(i);
    data[i]->dev_ctx.Wait();
  }
  VLOG(1) << "device synced";

  for (int i = 0; i < dev_count; ++i) {
    SetDeviceId(i);
    VLOG(1) << "Checking vector on device " << i;
    thrust::host_vector<double> tmp = data[i]->recv_buff;
    for (size_t j = 0; j < tmp.size(); ++j) {
      auto elem = static_cast<double>(j);
      elem *= dev_count;
      ASSERT_NEAR(tmp[j], elem, 1e-4);
    }
  }

  for (int i = 0; i < dev_count; ++i) {
    dynload::ncclCommDestroy(comms[i]);
  }
}
}  // namespace platform
}  // namespace paddle

int main(int argc, char** argv) {
  dev_count = paddle::platform::GetCUDADeviceCount();
  if (dev_count <= 1) {
    LOG(WARNING)
        << "Cannot test multi-gpu nccl, because the CUDA device count is "
        << dev_count;
    return 0;
  }
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

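The final loop of the all_reduce test checks the expected sum-reduction result: every device fills send_buff[j] with j, so after ncclSum across dev_count ranks each recv_buff[j] should equal j * dev_count. A host-only sketch of that invariant (plain C++, no GPU or NCCL needed):

// Mirrors the test's check: dev_count ranks each contribute j, the sum is j * dev_count.
#include <cassert>
#include <vector>

int main() {
  const int dev_count = 2, elem_count = 10000;  // elem_count mirrors ELEM_COUNT above
  std::vector<double> recv(elem_count, 0.0);
  for (int rank = 0; rank < dev_count; ++rank)  // every rank contributes send_buff[j] = j
    for (int j = 0; j < elem_count; ++j) recv[j] += j;
  for (int j = 0; j < elem_count; ++j) assert(recv[j] == j * dev_count);
  return 0;
}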