|
|
@ -22,6 +22,7 @@ limitations under the License. */
|
|
|
|
#ifdef __NVCC__
|
|
|
|
#ifdef __NVCC__
|
|
|
|
#include <cuda.h>
|
|
|
|
#include <cuda.h>
|
|
|
|
#include <thrust/iterator/iterator_adaptor.h>
|
|
|
|
#include <thrust/iterator/iterator_adaptor.h>
|
|
|
|
|
|
|
|
#include "paddle/fluid/platform/cuda_primitives.h"
|
|
|
|
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
|
|
|
|
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
@ -333,24 +334,12 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#ifdef __NVCC__
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// __shfl_down has been deprecated as of CUDA 9.0.
|
|
|
|
#ifdef __NVCC__
|
|
|
|
#if CUDA_VERSION < 9000
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
|
|
|
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
|
|
|
|
|
|
|
|
return __shfl_down(val, delta);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
#define FULL_WARP_MASK 0xFFFFFFFF
|
|
|
|
|
|
|
|
#define CREATE_SHFL_MASK(mask, predicate) \
|
|
|
|
|
|
|
|
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
template <typename T>
|
|
|
|
__device__ T reduceSum(T val, int tid, int len) {
|
|
|
|
__device__ T reduceSum(T val, int tid, int len) {
|
|
|
|
// TODO(zcd): The warp size should be taken from the
|
|
|
|
// NOTE(zcd): The warp size should be taken from the
|
|
|
|
// parameters of the GPU but not specified as 32 simply.
|
|
|
|
// parameters of the GPU but not specified as 32 simply.
|
|
|
|
// To make the reduceSum more efficiently,
|
|
|
|
// To make the reduceSum more efficiently,
|
|
|
|
// I use Warp-Level Parallelism and assume the Warp size
|
|
|
|
// I use Warp-Level Parallelism and assume the Warp size
|
|
|
@ -362,7 +351,7 @@ __device__ T reduceSum(T val, int tid, int len) {
|
|
|
|
CREATE_SHFL_MASK(mask, tid < len);
|
|
|
|
CREATE_SHFL_MASK(mask, tid < len);
|
|
|
|
|
|
|
|
|
|
|
|
for (int offset = warpSize / 2; offset > 0; offset /= 2)
|
|
|
|
for (int offset = warpSize / 2; offset > 0; offset /= 2)
|
|
|
|
val += __shfl_down_sync(mask, val, offset);
|
|
|
|
val += platform::__shfl_down_sync(mask, val, offset);
|
|
|
|
|
|
|
|
|
|
|
|
if (tid < warpSize) shm[tid] = 0;
|
|
|
|
if (tid < warpSize) shm[tid] = 0;
|
|
|
|
|
|
|
|
|
|
|
@ -378,7 +367,7 @@ __device__ T reduceSum(T val, int tid, int len) {
|
|
|
|
if (tid < warpSize) {
|
|
|
|
if (tid < warpSize) {
|
|
|
|
val = shm[tid];
|
|
|
|
val = shm[tid];
|
|
|
|
for (int offset = warpSize / 2; offset > 0; offset /= 2)
|
|
|
|
for (int offset = warpSize / 2; offset > 0; offset /= 2)
|
|
|
|
val += __shfl_down_sync(mask, val, offset);
|
|
|
|
val += platform::__shfl_down_sync(mask, val, offset);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return val;
|
|
|
|
return val;
|
|
|
|