@@ -114,7 +114,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
                  << "CUDNN_BN_MIN_EPSILON instead.";
     }
     epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-#if CUDNN_VERSION_MIN(7, 0, 0)
+#if CUDNN_VERSION_MIN(7, 0, 1)
     if (FLAGS_cudnn_batchnorm_spatial_persistent) {
       mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
     } else {
@@ -122,7 +122,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     }
 #else
     mode_ = CUDNN_BATCHNORM_SPATIAL;
-#endif
+#endif  // CUDNN_VERSION_MIN(7, 0, 1)
 
     VLOG(3) << "Setting descriptors.";
     std::vector<int> dims;
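
Taken together, the two hunks above leave the forward kernel's mode selection reading as follows. This is a reassembly for readability, not a verbatim quote; the body of the "} else {" branch falls in a line elided between the hunks and is assumed to be the non-persistent mode:

    #if CUDNN_VERSION_MIN(7, 0, 1)
        if (FLAGS_cudnn_batchnorm_spatial_persistent) {
          mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
        } else {
          mode_ = CUDNN_BATCHNORM_SPATIAL;  // assumed; elided between the hunks
        }
    #else
        mode_ = CUDNN_BATCHNORM_SPATIAL;
    #endif  // CUDNN_VERSION_MIN(7, 0, 1)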
@@ -151,7 +151,10 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     auto handle = dev_ctx.cudnn_handle();
 
     // Now, depending on whether we are running test or not, we have two paths.
-    if (test_mode || use_global_stats) {
+    // It is training mode when it's not reference AND not using pre-trained
+    // model.
+    bool training = !test_mode && !use_global_stats;
+    if (!training) {
       // only when test we use input to do computation.
       const auto *est_mean = ctx.Input<Tensor>("Mean");
       const auto *est_var = ctx.Input<Tensor>("Variance");
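
The rewritten guard is behavior-preserving: by De Morgan's law the new test is equivalent to the old one, so the patch only gives the predicate a name:

    // Equivalence of the old and new conditions:
    //   !training == !(!test_mode && !use_global_stats)
    //             ==  test_mode || use_global_stats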
@@ -234,7 +237,6 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
-
       bool called = false;
 #if CUDNN_VERSION_MIN(7, 4, 1)
       if (compute_format == DataLayout::kNHWC) {
         called = true;
         size_t workspace_size = 0;
         size_t reserve_space_size = 0;
@@ -281,11 +283,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
                 ctx.GetPlace(), transformed_x.type(), workspace_size);
         PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
-                handle, mode_, CUDNN_BATCHNORM_OPS_BN,
-                CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
-                data_desc_, transformed_x.template data<T>(), nullptr,
-                nullptr, data_desc_, transformed_y.template data<T>(),
-                bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+                handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType<T>::kOne(),
+                CudnnDataType<T>::kZero(), data_desc_,
+                transformed_x.template data<T>(), nullptr, nullptr, data_desc_,
+                transformed_y.template data<T>(), bn_param_desc_,
+                scale->template data<BatchNormParamType<T>>(),
                 bias->template data<BatchNormParamType<T>>(), this_factor,
                 mean_out->template mutable_data<BatchNormParamType<T>>(
                     ctx.GetPlace()),
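
The workspace_size and reserve_space_size consumed here are computed in lines elided from this hunk. A minimal sketch of the standard cuDNN (>= 7.4.1) size queries, assuming Paddle's dynload exposes these entry points the same way it does the calls shown above:

    // Query how much scratch and reserve memory ForwardTrainingEx needs.
    size_t workspace_size = 0;
    size_t reserve_space_size = 0;
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::
            cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
                handle, mode_, CUDNN_BATCHNORM_OPS_BN, data_desc_,
                /*zDesc=*/nullptr, data_desc_, bn_param_desc_,
                /*activationDesc=*/nullptr, &workspace_size));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
            handle, mode_, CUDNN_BATCHNORM_OPS_BN,
            /*activationDesc=*/nullptr, data_desc_, &reserve_space_size));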
@@ -298,8 +300,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
                     ctx.GetPlace()),
                 nullptr, workspace_ptr, workspace_size, reserve_space_ptr,
                 reserve_space_size));
       }
-#endif
+#endif  // CUDNN_VERSION_MIN(7, 4, 1)
       if (!called) {
         PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnBatchNormalizationForwardTraining(
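
When called remains false (cuDNN older than 7.4.1, or a layout other than NHWC), the kernel falls back to the plain cudnnBatchNormalizationForwardTraining call that the truncated hunk above begins. A sketch of the remaining arguments following the standard cuDNN signature; variance_out, saved_mean, and saved_variance are assumed output names, not copied from this excerpt:

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnBatchNormalizationForwardTraining(
            handle, mode_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
            data_desc_, transformed_x.template data<T>(),   // x
            data_desc_, transformed_y.template data<T>(),   // y
            bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
            bias->template data<BatchNormParamType<T>>(), this_factor,
            mean_out->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),                            // running mean
            variance_out->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),                            // running variance
            epsilon,
            saved_mean->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),                            // saved mean
            saved_variance->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace())));                          // saved inv variance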
@@ -640,7 +641,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
                  << "CUDNN_BN_MIN_EPSILON instead.";
     }
     epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-#if CUDNN_VERSION_MIN(7, 0, 0)
+#if CUDNN_VERSION_MIN(7, 0, 1)
     if (FLAGS_cudnn_batchnorm_spatial_persistent) {
       mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
     } else {
@@ -648,7 +649,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     }
 #else
     mode_ = CUDNN_BATCHNORM_SPATIAL;
-#endif
+#endif  // CUDNN_VERSION_MIN(7, 0, 1)
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
@@ -672,10 +673,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
               num, transformed_x.data<T>(), grid2, block, stream);
       }
-
+    // This branch calls CUDNN APIs
     if (d_scale && d_bias) {
       bool called = false;
 #if CUDNN_VERSION_MIN(7, 4, 1)
       if (compute_format == DataLayout::kNHWC) {
         called = true;
         size_t workspace_size = 0;
         void *workspace_ptr = nullptr;
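
The workspace sized here feeds cudnnBatchNormalizationBackwardEx in lines elided from this diff. A minimal sketch of the standard cuDNN (>= 7.4.1) workspace query, with descriptor names assumed to match this file:

    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
            handle, mode_, CUDNN_BATCHNORM_OPS_BN,
            /*xDesc=*/data_desc_, /*yDesc=*/data_desc_, /*dyDesc=*/data_desc_,
            /*dzDesc=*/nullptr, /*dxDesc=*/data_desc_,
            /*dBnScaleBiasDesc=*/bn_param_desc_, /*activationDesc=*/nullptr,
            &workspace_size));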
@@ -738,8 +739,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
                 /*reserveSpace=*/const_cast<T *>(
                     reserve_space->template data<T>()),
                 /*reserveSpaceSizeInBytes=*/reserve_space_size));
       }
-#endif
+#endif  // CUDNN_VERSION_MIN(7, 4, 1)
       if (!called) {
         PADDLE_ENFORCE_CUDA_SUCCESS(
             platform::dynload::cudnnBatchNormalizationBackward(
@@ -764,6 +764,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
             ctx, &transformed_d_x, d_x);
       }
     } else {
+      // This branch calls CUDA kernels
       if (compute_format == DataLayout::kNCHW) {
         if (d_x) {
           BNBackwardData<T, block, framework::DataLayout::kNCHW><<<