|
|
|
@ -63,6 +63,7 @@ __device__ T reduceSum(T val, int tid, int len) {
|
|
|
|
val += platform::CudaShuffleDownSync(mask, val, offset);
|
|
|
|
val += platform::CudaShuffleDownSync(mask, val, offset);
|
|
|
|
|
|
|
|
|
|
|
|
if (tid < warpSize) shm[tid] = 0;
|
|
|
|
if (tid < warpSize) shm[tid] = 0;
|
|
|
|
|
|
|
|
__syncthreads();
|
|
|
|
|
|
|
|
|
|
|
|
if (tid % warpSize == 0) {
|
|
|
|
if (tid % warpSize == 0) {
|
|
|
|
shm[tid / warpSize] = val;
|
|
|
|
shm[tid / warpSize] = val;
|
|
|
|
|