@@ -289,7 +289,7 @@ __global__ void SoftmaxKernelWithEltadd2<half2>(
     const int head_num, const int seq_len, const unsigned mask) {
 // operator "+" of half is only supported after CUDA version 10.0
 // HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
-#if defined(PADDLE_WITH_CUDA) || \
+#if defined(PADDLE_WITH_CUDA) && \
     (CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000)
   int qk_offset = blockIdx.x * seq_len;
   int idx = threadIdx.x;
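Why "||" becomes "&&": with "||", any build that defines PADDLE_WITH_CUDA satisfies the guard, so the half2 branch was compiled even under a pre-10.0 toolkit, where "+" on half does not exist and compilation fails. With "&&", the branch compiles only when the build targets CUDA and the architecture/toolkit actually support half arithmetic. A minimal sketch of the corrected guard, reusing this file's macros; the HalfAdd helper below is hypothetical, not part of the patch:

#include <cuda_fp16.h>

#if defined(PADDLE_WITH_CUDA) && \
    (CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000)
// Only compiled for CUDA device passes on FP16-capable architectures with
// a CUDA >= 10.0 toolkit; "+" on half2 is unavailable before 10.0.
__device__ half2 HalfAdd(half2 a, half2 b) { return a + b; }
#endif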
@@ -407,7 +407,7 @@ template class MultiHeadGPUComputeFunctor<float>;
 
 // device function 'operator()' is not supported until CUDA 10.0
 // HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
-#if defined(PADDLE_WITH_CUDA) || CUDA_VERSION >= 10000
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000
 template class MultiHeadGPUComputeFunctor<half>;
 #endif
 
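The explicit instantiation "template class MultiHeadGPUComputeFunctor<half>;" forces the compiler to emit every member function for half, including the device code that does half arithmetic, so the instantiation itself must sit behind the corrected guard: under the old "||", any CUDA 9.x build would have attempted it and failed. The pattern, sketched with a hypothetical Functor rather than the real class:

#include <cuda_fp16.h>

template <typename T>
struct Functor {
  // Device-side add; for T = half this needs CUDA >= 10.0.
  __device__ T Add(T a, T b) const { return a + b; }
};

template struct Functor<float>;  // safe on every toolkit
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000
template struct Functor<half>;   // emitted only when half arithmetic exists
#endif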
@@ -646,7 +646,7 @@ template class SkipLayerNormFunctor<float>;
 
 // device function 'operator()' is not supported until CUDA 10.0
 // HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
-#if defined(PADDLE_WITH_CUDA) || CUDA_VERSION >= 10000
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000
 template class SkipLayerNormFunctor<half>;
 #endif
 
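The same correction guards the half instantiation of SkipLayerNormFunctor. The HIP comment explains the other side of the condition: hip.cmake defines __HIP_NO_HALF_CONVERSIONS__, so ROCm builds must not reach this code either; since a HIP build never defines PADDLE_WITH_CUDA, the "&&" form excludes it outright rather than relying on CUDA_VERSION being undefined, as the old "||" form did.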