diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index e1438a1eef..e8b1d58712 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -214,8 +214,8 @@ class CUDAContext {
             << "Please recompile or reinstall Paddle with compatible CUDNN "
                "version.";
       }
-      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
+      PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
+      PADDLE_RETRY_CUDA_SUCCESS(
           dynload::cudnnSetStream(cudnn_handle_, RawStream()));
     } else {
       cudnn_handle_ = nullptr;
@@ -223,9 +223,8 @@ class CUDAContext {
   }
 
   void InitCuSolverContext() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cusolverDnCreate(&cusolver_dn_handle_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
+    PADDLE_RETRY_CUDA_SUCCESS(
         dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
   }
 
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 6a27249817..fc57d3a4d0 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -904,6 +904,34 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
     }                                                            \
   } while (0)
 
+// Evaluates COND and, while the returned status differs from the kSuccess
+// value registered for its type in details::CudaStatusType, evaluates it
+// again, for at most 5 evaluations in total. If the final status is still
+// not kSuccess, reports it through __THROW_ERROR_INTERNAL__ as an External
+// error. COND may be evaluated several times, so it must be safe to repeat.
+// Every local is spelled __name__ so that identifiers inside COND cannot
+// bind to the macro's own variables when COND is re-expanded in the loop.
+#define PADDLE_RETRY_CUDA_SUCCESS(COND)                          \
+  do {                                                           \
+    auto __cond__ = (COND);                                      \
+    constexpr int __max_attempts__ = 5;                          \
+    int __attempts__ = 1;                                        \
+    using __CUDA_STATUS_TYPE__ = decltype(__cond__);             \
+    constexpr auto __success_type__ =                            \
+        ::paddle::platform::details::CudaStatusType<             \
+            __CUDA_STATUS_TYPE__>::kSuccess;                     \
+    while (UNLIKELY(__cond__ != __success_type__) &&             \
+           __attempts__ < __max_attempts__) {                    \
+      __cond__ = (COND);                                         \
+      ++__attempts__;                                            \
+    }                                                            \
+    if (UNLIKELY(__cond__ != __success_type__)) {                \
+      auto __summary__ = ::paddle::platform::errors::External(   \
+          ::paddle::platform::build_nvidia_error_msg(__cond__)); \
+      __THROW_ERROR_INTERNAL__(__summary__);                     \
+    }                                                            \
+  } while (0)
+
 #undef DEFINE_CUDA_STATUS_TYPE
 #endif  // PADDLE_WITH_CUDA
 
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 22550de5b3..c2f4d6ff2f 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -114,7 +114,7 @@ struct NCCLContextMap {
     // if num_trainers == 1, should create a new nccl id for local comms.
     if (num_trainers == 1 && nccl_id == nullptr) {
       std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
+      PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
           comms.get(), static_cast<int>(order_.size()), order_.data()));
     } else {
       PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument(
@@ -132,8 +132,8 @@ struct NCCLContextMap {
         }
         VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks
                 << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
-        PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(gpu_id));
-        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
+        PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(gpu_id));
+        PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
             comms.get() + i, nranks, *nccl_id, rank));
       }
     }
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index 76d93259a6..fd47dc37e7 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -36,7 +36,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
             opt = fluid.optimizer.SGD(learning_rate=0.001)
             opt.minimize(loss)
 
-            batch_size = 16
+            batch_size = 32
             image = np.random.normal(size=(batch_size, 784)).astype('float32')
             label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
 