Modify CublasHandleHolder to Fix Random Unittest Failure. test=develop (#29617)

Modify CublasHandleHolder to use PADDLE_RETRY_CUDA_SUCCESS instead of PADDLE_ENFORCE_CUDA_SUCCESS in order to fix a random unittest failure. We checked the unittest log, which showed a CUDA allocation error in this file that may be due to insufficient GPU resources. We fixed a similar failure in the past, so we applied PADDLE_RETRY_CUDA_SUCCESS here.
revert-31562-mean
Huihuang Zheng 4 years ago committed by GitHub
parent 6cfa59de1b
commit 4c4d4ba5e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -78,11 +78,11 @@ namespace platform {
class CublasHandleHolder {
public:
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
#if CUDA_VERSION >= 9000
if (math_type == CUBLAS_TENSOR_OP_MATH) {
PADDLE_ENFORCE_CUDA_SUCCESS(
PADDLE_RETRY_CUDA_SUCCESS(
dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH));
#if CUDA_VERSION >= 11000
} else if (math_type == CUBLAS_TF32_TENSOR_OP_MATH) {
@@ -94,7 +94,7 @@ class CublasHandleHolder {
}
~CublasHandleHolder() PADDLE_MAY_THROW {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
}
template <typename Callback>

Loading…
Cancel
Save