Add an error log when setting the device id fails

pull/14055/head
lizhenyu 4 years ago
parent dfd368a574
commit 3f9d9c5b2e

@ -95,7 +95,7 @@ void GPUSession::Init(uint32_t device_id) {
MS_EXCEPTION_IF_NULL(get_local_rank_funcptr);
device_id = IntToUint((*get_local_rank_funcptr)());
}
bool ret = device::gpu::CudaDriver::set_current_device(UintToInt(device_id));
bool ret = device::gpu::CudaDriver::SetDevice(UintToInt(device_id));
if (!ret) {
MS_LOG(EXCEPTION) << "GPUSession failed to set current device id:" << device_id;
}

@ -18,6 +18,7 @@
#include "ps/ps_cache/ps_cache_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/hash_impl.cuh"
#include "runtime/device/gpu/gpu_common.h"
#include "runtime/device/gpu/cuda_driver.h"
#include "runtime/device/gpu/gpu_memory_allocator.h"
#include "utils/ms_context.h"
@ -26,7 +27,11 @@ namespace ps {
namespace gpu {
MS_REG_PS_CACHE(kGPUDevice, GPUPsCache);
bool GPUPsCache::InitDevice(uint32_t device_id, const void *) {
CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE(cudaSetDevice(device_id), "Cuda set device failed")
bool ret = device::gpu::CudaDriver::SetDevice(UintToInt(device_id));
if (!ret) {
MS_LOG(ERROR) << "Failed to set device id:" << device_id;
return false;
}
CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE(cudaStreamCreate(reinterpret_cast<CUstream_st **>(&stream_)),
"Cuda create stream failed");
return true;

@ -238,11 +238,16 @@ int CudaDriver::device_count() {
return dev_count;
}
bool CudaDriver::set_current_device(int index) {
bool CudaDriver::SetDevice(int index) {
auto ret = cudaSetDevice(index);
if (ret != cudaSuccess) {
MS_LOG(ERROR) << "cudaSetDevice " << index << " failed, ret[" << static_cast<int>(ret) << "], "
<< cudaGetErrorString(ret);
MS_LOG(ERROR)
<< "SetDevice for id:" << index << " failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret)
<< ". Please make sure that the 'device_id' set in context is in the range:[0, total number of GPU). "
"If the environment variable 'CUDA_VISIBLE_DEVICES' is set, the total number of GPU will be the number set "
"in the environment variable 'CUDA_VISIBLE_DEVICES'. For example, if export CUDA_VISIBLE_DEVICES=4,5,6, the "
"'device_id' can be 0,1,2 at the moment, 'device_id' starts from 0, and 'device_id'=0 means using GPU of "
"number 4.";
return false;
}
return true;

@ -63,7 +63,7 @@ class CudaDriver {
// Encapsulate the cuda APIs associated with device management.
static int device_count();
static bool set_current_device(int index);
static bool SetDevice(int index);
private:
CudaDriver() = delete;

@ -106,7 +106,14 @@ void GpuBufferMgr::set_device_id(int device_id) { cur_dev_id_ = device_id; }
// Calls cudaSetDevice with the stored cur_dev_id_.
// The function returns void: when cudaSetDevice does not return cudaSuccess the
// failure is only reported through MS_LOG(ERROR), so callers get no status back.
void GpuBufferMgr::set_device() const {
auto ret = cudaSetDevice(cur_dev_id_);
if (ret != cudaSuccess) {
// NOTE(review): two MS_LOG(ERROR) statements follow. The first (short) one looks
// like the pre-change line of this diff hunk shown next to its replacement —
// confirm that only the detailed message below is meant to remain.
MS_LOG(ERROR) << "cudaSetDevice, ret[" << static_cast<int>(ret) << "]";
// Detailed message: includes the device id, the numeric CUDA error code, the
// text from cudaGetErrorString, and a hint about how 'device_id' relates to
// CUDA_VISIBLE_DEVICES.
MS_LOG(ERROR)
<< "Set device for id:" << cur_dev_id_ << " failed, ret[" << static_cast<int>(ret) << "], "
<< cudaGetErrorString(ret)
<< ". Please make sure that the 'device_id' set in context is in the range:[0, total number of GPU). "
"If the environment variable 'CUDA_VISIBLE_DEVICES' is set, the total number of GPU will be the number set "
"in the environment variable 'CUDA_VISIBLE_DEVICES'. For example, if export CUDA_VISIBLE_DEVICES=4,5,6, the "
"'device_id' can be 0,1,2 at the moment, 'device_id' starts from 0, and 'device_id'=0 means using GPU of "
"number 4.";
}
}

@ -24,7 +24,7 @@ namespace mindspore {
namespace device {
namespace gpu {
void GPUDeviceManager::InitDevice() {
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::set_current_device(SizeToInt(cur_dev_id_)), "Failed to set current device id");
CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SetDevice(SizeToInt(cur_dev_id_)), "Failed to set current device id");
CHECK_OP_RET_WITH_EXCEPT(CreateStream(&default_stream_), "Failed to create CUDA stream.");
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreate(&cudnn_handle_), "Failed to create cuDNN handle");
CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnSetStream(cudnn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),

Loading…
Cancel
Save