|
|
|
@ -179,6 +179,7 @@ __global__ void MatrixColReduce(const T *__restrict__ in, T *__restrict__ out,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#if CUDA_VERSION >= 10000
|
|
|
|
|
template <int SIZE>
|
|
|
|
|
__global__ void VecFP16MatrixColReduce(const __half2 *__restrict__ in,
|
|
|
|
|
__half2 *__restrict__ out, size_t width,
|
|
|
|
@ -199,6 +200,7 @@ __global__ void VecFP16MatrixColReduce(const __half2 *__restrict__ in,
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
__global__ void MatrixReduceLongWidth(const T *__restrict__ in, T *out,
|
|
|
|
@ -365,6 +367,7 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
|
|
|
|
|
int max_blocks = std::max(max_physical_threads / (block_x * block_y), 1);
|
|
|
|
|
int theory_block = (width + blocks.x - 1) / blocks.x;
|
|
|
|
|
dim3 grids(std::min(theory_block, max_blocks));
|
|
|
|
|
#if CUDA_VERSION >= 10000
|
|
|
|
|
if (std::is_same<T, paddle::platform::float16>::value && width < 2048 &&
|
|
|
|
|
width % 2 == 0 && height % 64 == 0) {
|
|
|
|
|
auto &dev_ctx =
|
|
|
|
@ -382,6 +385,7 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
|
|
|
|
|
width, height);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
if (width / height < 32) {
|
|
|
|
|
MatrixColReduce<T, block_x, block_y><<<grids, blocks, 0, stream>>>(
|
|
|
|
|