Refine cublas to support CUBLAS_TENSOR_OP_MATH (#13929)

* refine cublase
test=develop

* code refine

* refine cublas

* add GEMME_EX

* add enable_cublas_tensor_op_math doc and add cublasCall
test=develop

* fix CublasCall for cuda version
test=develop

* fix error
test=develop

* fix GEMM_EX to be compatible with gcc 4.8
test=develop

* add GEMM_EX
test=develop

* to compatiable with gcc4.8
test=develop
panyx0718-patch-1
chengduo 6 years ago committed by GitHub
parent dd6fd4c747
commit 00b9e9a135
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because it is too large Load Diff

@ -143,6 +143,39 @@ class CudnnWorkspaceHandle {
std::unique_ptr<std::lock_guard<std::mutex>> guard_;
};
#if CUDA_VERSION >= 9000
class ScopedCublasMathMode {
public:
ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode)
: handle_(handle) {
need_reset = false;
PADDLE_ENFORCE(
platform::dynload::cublasGetMathMode(handle_, &old_math_mode_),
"Failed to get old cublas math mode");
if (old_math_mode_ != new_math_mode) {
PADDLE_ENFORCE(
platform::dynload::cublasSetMathMode(handle_, new_math_mode),
"Failed to set old cublas math mode");
need_reset = true;
}
}
~ScopedCublasMathMode() {
if (need_reset) {
PADDLE_ENFORCE(
platform::dynload::cublasSetMathMode(handle_, old_math_mode_),
"Failed to set old cublas math mode");
}
}
private:
cublasHandle_t handle_;
cublasMath_t old_math_mode_;
bool need_reset;
};
#endif
class CUDADeviceContext : public DeviceContext {
public:
explicit CUDADeviceContext(CUDAPlace place);
@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext {
callback_manager_->Wait();
}
#if CUDA_VERSION >= 9000
/*! \brief CublasCall may need to change cublas's config,
* but the cublas may be hold by multi-thread, so we should
* add lock here. */
template <typename Callback>
void CublasCall(Callback callback, cublasMath_t new_math) {
std::lock_guard<std::mutex> guard(cublas_mtx_);
ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math);
callback();
}
#endif
private:
CUDAPlace place_;
@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext {
// If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
mutable std::mutex callback_mtx_;
std::unique_ptr<StreamCallbackManager> callback_manager_;
mutable std::mutex cublas_mtx_;
};
template <>

@ -61,9 +61,6 @@ extern void *cublas_dso_handle;
extern DynLoad__##__name __name
#endif
#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
@ -93,22 +90,23 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(cublasGemmEx); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched);
#endif
// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode);
#endif
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#if CUDA_VERSION >= 9010
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx);
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP

@ -26,6 +26,16 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
"additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk.");
DEFINE_bool(
enable_cublas_tensor_op_math, false,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
"but it may loss precision. Currently, There are two CUDA libraries that"
" use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
" GEMM computations(the matrices must be either half precision or single "
"precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
"input and output must be half precision) and recurrent neural networks "
"(RNNs).");
namespace paddle {
namespace platform {
@ -64,6 +74,16 @@ int GetCUDADriverVersion(int id) {
return driver_version;
}
bool TensorCoreAvailable() {
#if CUDA_VERSION >= 9000
int device = GetCurrentDeviceId();
int driver_version = GetCUDAComputeCapability(device);
return driver_version >= 70;
#else
return false;
#endif
}
int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count;

@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id);
//! Get the driver version of the ith GPU
int GetCUDADriverVersion(int id);
//! Wheter the current device support TensorCore
bool TensorCoreAvailable();
//! Get the MultiProcessors of the ith GPU.
int GetCUDAMultiProcessors(int i);

@ -133,7 +133,8 @@ def __bootstrap__():
if core.is_compiled_with_cuda():
read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'conv_workspace_size_limit', 'cudnn_exhaustive_search'
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search'
]
core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)])

Loading…
Cancel
Save