|
|
@ -18,34 +18,33 @@ limitations under the License. */
|
|
|
|
namespace paddle {
|
|
|
|
namespace paddle {
|
|
|
|
namespace platform {
|
|
|
|
namespace platform {
|
|
|
|
|
|
|
|
|
|
|
|
// __shfl_down and __shfl have been deprecated as of CUDA 9.0.
|
|
|
|
|
|
|
|
#if CUDA_VERSION < 9000
|
|
|
|
#if CUDA_VERSION < 9000
|
|
|
|
template <typename T>
|
|
|
|
|
|
|
|
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
|
|
|
|
|
|
|
|
return __shfl_down(val, delta);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
|
|
|
__forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line,
|
|
|
|
|
|
|
|
int width) {
|
|
|
|
|
|
|
|
return __shfl(val, src_line, width);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
|
|
|
|
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
|
|
|
|
#else
|
|
|
|
#else
|
|
|
|
#define FULL_WARP_MASK 0xFFFFFFFF
|
|
|
|
#define FULL_WARP_MASK 0xFFFFFFFF
|
|
|
|
#define CREATE_SHFL_MASK(mask, predicate) \
|
|
|
|
#define CREATE_SHFL_MASK(mask, predicate) \
|
|
|
|
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
|
|
|
|
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
template <typename T>
|
|
|
|
__forceinline__ __device__ T __shfl_down_sync(unsigned mask, T val, int delta) {
|
|
|
|
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
|
|
|
|
return __shfl_down_sync(mask, val, delta);
|
|
|
|
int delta, int width = 32) {
|
|
|
|
|
|
|
|
#if CUDA_VERSION < 9000
|
|
|
|
|
|
|
|
return __shfl_down(val, delta, width);
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
return __shfl_down_sync(mask, val, delta, width);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
template <typename T>
|
|
|
|
__forceinline__ __device__ T __shfl_sync(unsigned mask, T val, int src_line,
|
|
|
|
__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
|
|
|
|
int width) {
|
|
|
|
int width = 32) {
|
|
|
|
|
|
|
|
#if CUDA_VERSION < 9000
|
|
|
|
|
|
|
|
return __shfl(val, src_line, width);
|
|
|
|
|
|
|
|
#else
|
|
|
|
return __shfl_sync(mask, val, src_line, width);
|
|
|
|
return __shfl_sync(mask, val, src_line, width);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
template <typename T>
|
|
|
|
__device__ T reduceSum(T val, int tid, int len) {
|
|
|
|
__device__ T reduceSum(T val, int tid, int len) {
|
|
|
@ -61,7 +60,7 @@ __device__ T reduceSum(T val, int tid, int len) {
|
|
|
|
CREATE_SHFL_MASK(mask, tid < len);
|
|
|
|
CREATE_SHFL_MASK(mask, tid < len);
|
|
|
|
|
|
|
|
|
|
|
|
for (int offset = warpSize / 2; offset > 0; offset /= 2)
|
|
|
|
for (int offset = warpSize / 2; offset > 0; offset /= 2)
|
|
|
|
val += platform::__shfl_down_sync(mask, val, offset);
|
|
|
|
val += platform::CudaShuffleDownSync(mask, val, offset);
|
|
|
|
|
|
|
|
|
|
|
|
if (tid < warpSize) shm[tid] = 0;
|
|
|
|
if (tid < warpSize) shm[tid] = 0;
|
|
|
|
|
|
|
|
|
|
|
@ -75,7 +74,7 @@ __device__ T reduceSum(T val, int tid, int len) {
|
|
|
|
if (tid < warpSize) {
|
|
|
|
if (tid < warpSize) {
|
|
|
|
val = shm[tid];
|
|
|
|
val = shm[tid];
|
|
|
|
for (int offset = warpSize / 2; offset > 0; offset /= 2)
|
|
|
|
for (int offset = warpSize / 2; offset > 0; offset /= 2)
|
|
|
|
val += platform::__shfl_down_sync(mask, val, offset);
|
|
|
|
val += platform::CudaShuffleDownSync(mask, val, offset);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return val;
|
|
|
|
return val;
|
|
|
|
}
|
|
|
|
}
|
|
|
|