@ -63,6 +63,7 @@ __device__ T reduceSum(T val, int tid, int len) {
val += platform::CudaShuffleDownSync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0;
__syncthreads();
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;