namespace paddle {
namespace operators {
namespace math {
#define CUDA_NUM_THREADS 1024
// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
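
// Number of thread blocks needed to cover N elements at CUDA_NUM_THREADS
// threads per block, rounding up.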
inline static int PADDLE_GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
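
// Channel-wise PReLU, assuming NCHW layout: each channel has its own slope
// alpha[c]. For a flat index into an N x C x H x W tensor,
// index / plane_size == n * channel_num + c, so taking that quotient modulo
// channel_num recovers the channel index.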
template <typename T>
__global__ void PReluChannelWiseKernel(const T *input, const T *alpha,
T *output, size_t channel_num,
size_t plane_size, size_t numel) {
  CUDA_KERNEL_LOOP(index, numel) {
size_t temp = index / plane_size;
size_t channel_index = temp % channel_num;
T scale = alpha[channel_index];
T x = input[index];
output[index] = (x > 0) ? x : scale * x;
}
}
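
// Element-wise PReLU: alpha holds one slope per element of a single sample
// (spatial_size == C * H * W here), and index % spatial_size maps every
// sample in the batch onto the same alpha entry.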
template <typename T>
__global__ void PReluElementWiseKernel(const T *input, const T *alpha,
T *output, size_t spatial_size,
size_t numel) {
  CUDA_KERNEL_LOOP(index, numel) {
size_t element_index = index % spatial_size;
T scale = alpha[element_index];
T x = input[index];
output[index] = (x > 0) ? x : scale * x;
}
}
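
// Scalar PReLU: a single slope alpha[0] is shared by all elements.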
template <typename T>
__global__ void PReluScalarKernel(const T *input, const T *alpha, T *output,
size_t numel) {
T scale = alpha[0];
  CUDA_KERNEL_LOOP(index, numel) {
T x = input[index];
output[index] = (x > 0) ? x : scale * x;
}
}
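
// Host-side launchers. input_shape is assumed to be NCHW, i.e.
// {batch, channels, height, width}. The grid-stride kernels above cover all
// numel elements whatever the grid size, so no upper bound on the block
// count needs to be checked.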
template <typename T>
void PreluChannelWiseDirectCUDAFunctor<T>::operator()(
cudaStream_t stream, const T *input, const T *alpha, T *output,
std::vector<int> input_shape) {
size_t plane_size = input_shape[2] * input_shape[3];
size_t spatial_size = input_shape[1] * plane_size;
size_t numel = input_shape[0] * spatial_size;
PReluChannelWiseKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0,
stream>>>(input, alpha, output, input_shape[1],
plane_size, numel);
}
template <typename T>
void PreluElementWiseDirectCUDAFunctor<T>::operator()(
cudaStream_t stream, const T *input, const T *alpha, T *output,
std::vector<int> input_shape) {
size_t plane_size = input_shape[2] * input_shape[3];
size_t spatial_size = input_shape[1] * plane_size;
size_t numel = input_shape[0] * spatial_size;
PReluElementWiseKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0,
stream>>>(input, alpha, output, spatial_size, numel);
}
template <typename T>
void PreluScalarDirectCUDAFunctor<T>::operator()(cudaStream_t stream,
const T *input, const T *alpha,
T *output,
std::vector<int> input_shape) {
  size_t plane_size = input_shape[2] * input_shape[3];
  size_t spatial_size = input_shape[1] * plane_size;
  size_t numel = input_shape[0] * spatial_size;
  PReluScalarKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
      input, alpha, output, numel);
}
template class PreluChannelWiseDirectCUDAFunctor<float>;
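
// Minimal usage sketch (illustrative only; `stream`, `x_dev`, `alpha_dev`,
// and `y_dev` stand for a valid CUDA stream and caller-allocated device
// buffers, and n/c/h/w for the NCHW dimensions):
//
//   paddle::operators::math::PreluChannelWiseDirectCUDAFunctor<float> prelu;
//   prelu(stream, x_dev, alpha_dev, y_dev, {n, c, h, w});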