From 3f9d9c5b2e1ac35d71e901a6eabe48f61d905f52 Mon Sep 17 00:00:00 2001
From: lizhenyu
Date: Thu, 25 Mar 2021 11:29:18 +0800
Subject: [PATCH] add error log when set device id failed

---
Reviewer notes (this section is not part of the commit message):
 * Renames CudaDriver::set_current_device to CudaDriver::SetDevice and updates
   the callers in gpu_session.cc and gpu_device_manager.cc.
 * GPUPsCache::InitDevice now calls CudaDriver::SetDevice instead of invoking
   cudaSetDevice through CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE, and logs
   "Failed to set device id:" before returning false.
 * The cudaSetDevice failure logs in cuda_driver.cc and gpu_buffer_mgr.cc now
   describe the valid 'device_id' range and how CUDA_VISIBLE_DEVICES affects
   it; the gpu_buffer_mgr.cc log additionally gains the device id and
   cudaGetErrorString(ret).
 * NOTE(review): this patch text was recovered from a copy whose line breaks
   and angle-bracket contents had been stripped. Line structure was rebuilt
   from the hunk counts and the diffstat; the template arguments
   (static_cast<int>, reinterpret_cast<CUstream_st **>,
   reinterpret_cast<cudaStream_t>), the exact indentation, and the missing
   author e-mail address should be confirmed against the tree before applying.

 mindspore/ccsrc/backend/session/gpu_session.cc       |  2 +-
 mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc      |  7 ++++++-
 mindspore/ccsrc/runtime/device/gpu/cuda_driver.cc    | 11 ++++++++---
 mindspore/ccsrc/runtime/device/gpu/cuda_driver.h     |  2 +-
 mindspore/ccsrc/runtime/device/gpu/gpu_buffer_mgr.cc |  9 ++++++++-
 .../ccsrc/runtime/device/gpu/gpu_device_manager.cc   |  2 +-
 6 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index ad7b74c498..55edf1b8d6 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -95,7 +95,7 @@ void GPUSession::Init(uint32_t device_id) {
     MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
     device_id = IntToUint((*get_local_rank_funcptr)());
   }
-  bool ret = device::gpu::CudaDriver::set_current_device(UintToInt(device_id));
+  bool ret = device::gpu::CudaDriver::SetDevice(UintToInt(device_id));
   if (!ret) {
     MS_LOG(EXCEPTION) << "GPUSession failed to set current device id:" << device_id;
   }
diff --git a/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc b/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc
index 536b142f99..a3327e26de 100644
--- a/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc
+++ b/mindspore/ccsrc/ps/ps_cache/gpu/gpu_ps_cache.cc
@@ -18,6 +18,7 @@
 #include "ps/ps_cache/ps_cache_factory.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/hash_impl.cuh"
 #include "runtime/device/gpu/gpu_common.h"
+#include "runtime/device/gpu/cuda_driver.h"
 #include "runtime/device/gpu/gpu_memory_allocator.h"
 #include "utils/ms_context.h"
 
@@ -26,7 +27,11 @@ namespace ps {
 namespace gpu {
 MS_REG_PS_CACHE(kGPUDevice, GPUPsCache);
 bool GPUPsCache::InitDevice(uint32_t device_id, const void *) {
-  CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE(cudaSetDevice(device_id), "Cuda set device failed")
+  bool ret = device::gpu::CudaDriver::SetDevice(UintToInt(device_id));
+  if (!ret) {
+    MS_LOG(ERROR) << "Failed to set device id:" << device_id;
+    return false;
+  }
   CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE(cudaStreamCreate(reinterpret_cast<CUstream_st **>(&stream_)),
                                            "Cuda create stream failed");
   return true;
diff --git a/mindspore/ccsrc/runtime/device/gpu/cuda_driver.cc b/mindspore/ccsrc/runtime/device/gpu/cuda_driver.cc
index d0e7b668d8..211919be5e 100644
--- a/mindspore/ccsrc/runtime/device/gpu/cuda_driver.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/cuda_driver.cc
@@ -238,11 +238,16 @@ int CudaDriver::device_count() {
   return dev_count;
 }
 
-bool CudaDriver::set_current_device(int index) {
+bool CudaDriver::SetDevice(int index) {
   auto ret = cudaSetDevice(index);
   if (ret != cudaSuccess) {
-    MS_LOG(ERROR) << "cudaSetDevice " << index << " failed, ret[" << static_cast<int>(ret) << "], "
-                  << cudaGetErrorString(ret);
+    MS_LOG(ERROR)
+      << "SetDevice for id:" << index << " failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret)
+      << ". Please make sure that the 'device_id' set in context is in the range:[0, total number of GPU). "
+         "If the environment variable 'CUDA_VISIBLE_DEVICES' is set, the total number of GPU will be the number set "
+         "in the environment variable 'CUDA_VISIBLE_DEVICES'. For example, if export CUDA_VISIBLE_DEVICES=4,5,6, the "
+         "'device_id' can be 0,1,2 at the moment, 'device_id' starts from 0, and 'device_id'=0 means using GPU of "
+         "number 4.";
     return false;
   }
   return true;
diff --git a/mindspore/ccsrc/runtime/device/gpu/cuda_driver.h b/mindspore/ccsrc/runtime/device/gpu/cuda_driver.h
index 12a6c666d3..2d609f8eda 100644
--- a/mindspore/ccsrc/runtime/device/gpu/cuda_driver.h
+++ b/mindspore/ccsrc/runtime/device/gpu/cuda_driver.h
@@ -63,7 +63,7 @@ class CudaDriver {
 
   // Encapsulate the cuda APIs associated with device management.
   static int device_count();
-  static bool set_current_device(int index);
+  static bool SetDevice(int index);
 
  private:
   CudaDriver() = delete;
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_buffer_mgr.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_buffer_mgr.cc
index 5d56f1e71c..947490dfde 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_buffer_mgr.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_buffer_mgr.cc
@@ -106,7 +106,14 @@ void GpuBufferMgr::set_device_id(int device_id) { cur_dev_id_ = device_id; }
 void GpuBufferMgr::set_device() const {
   auto ret = cudaSetDevice(cur_dev_id_);
   if (ret != cudaSuccess) {
-    MS_LOG(ERROR) << "cudaSetDevice, ret[" << static_cast<int>(ret) << "]";
+    MS_LOG(ERROR)
+      << "Set device for id:" << cur_dev_id_ << " failed, ret[" << static_cast<int>(ret) << "], "
+      << cudaGetErrorString(ret)
+      << ". Please make sure that the 'device_id' set in context is in the range:[0, total number of GPU). "
+         "If the environment variable 'CUDA_VISIBLE_DEVICES' is set, the total number of GPU will be the number set "
+         "in the environment variable 'CUDA_VISIBLE_DEVICES'. For example, if export CUDA_VISIBLE_DEVICES=4,5,6, the "
+         "'device_id' can be 0,1,2 at the moment, 'device_id' starts from 0, and 'device_id'=0 means using GPU of "
+         "number 4.";
   }
 }
 
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.cc
index ee1c52c5bc..0420d9f6e4 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.cc
@@ -24,7 +24,7 @@ namespace mindspore {
 namespace device {
 namespace gpu {
 void GPUDeviceManager::InitDevice() {
-  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::set_current_device(SizeToInt(cur_dev_id_)), "Failed to set current device id");
+  CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SetDevice(SizeToInt(cur_dev_id_)), "Failed to set current device id");
   CHECK_OP_RET_WITH_EXCEPT(CreateStream(&default_stream_), "Failed to create CUDA stream.");
   CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreate(&cudnn_handle_), "Failed to create cuDNN handle");
   CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnSetStream(cudnn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),