@@ -26,6 +26,7 @@
 
 #ifdef __HIPCC__
 #include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
 #endif
 
 #include "paddle/fluid/framework/tensor.h"
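Note: the namespace cub = hipcub; alias added above is what makes the deletions in the hunks below safe. Under __HIPCC__, every later cub::-qualified name resolves to the hipcub implementation, so the per-call-site #ifdef branches that duplicated each CUB call become redundant. A minimal sketch of the same aliasing technique, separate from this patch (the DeviceReduce::Sum call in the comment is illustrative only, not code from this file):

    #ifdef __HIPCC__
    #include <hipcub/hipcub.hpp>
    namespace cub = hipcub;  // cub::X now names hipcub::X
    #else
    #include <cub/cub.cuh>
    #endif

    // One spelling now serves both CUDA and HIP builds, e.g.:
    //   cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);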
@@ -71,12 +72,7 @@ template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
 __global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
                                TransformOp transformer, Ty init,
                                int reduce_num) {
-#ifdef __HIPCC__
-  __shared__
-      typename hipcub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#else
   __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#endif
   int idx_x = blockIdx.x * reduce_num;
   int idx_y = threadIdx.x;
   Ty reduce_var = init;
@@ -85,13 +81,8 @@ __global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
     reducer(reduce_var, static_cast<Ty>(transformer(x[idx_x + idx_y])));
   __syncthreads();
 
-#ifdef __HIPCC__
-  reduce_var = hipcub::BlockReduce<Ty, BlockDim>(temp_storage)
-                   .Reduce(reduce_var, reducer);
-#else
   reduce_var =
       cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
-#endif
 
   if (threadIdx.x == 0) {
     y[blockIdx.x] = reduce_var;
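Note: the two hunks above keep the standard cub::BlockReduce idiom that the removed hipcub branches merely duplicated: statically sized shared TempStorage, one partial value per thread, a cooperative reduce call, and a result that is defined only in thread 0. A self-contained CUDA sketch of that idiom (BlockSumKernel, the float element type, and the plain sum standing in for this file's generic reducer are all illustrative):

    #include <cub/cub.cuh>

    template <int BlockDim>
    __global__ void BlockSumKernel(const float* x, float* y, int n) {
      // Scratch space required by BlockReduce; one instance per block.
      __shared__ typename cub::BlockReduce<float, BlockDim>::TempStorage temp;

      // Each thread accumulates a strided slice of its block's input row,
      // mirroring the idx_x = blockIdx.x * reduce_num layout above.
      float partial = 0.f;
      for (int i = threadIdx.x; i < n; i += BlockDim) {
        partial += x[blockIdx.x * n + i];
      }

      // Cooperative reduction; every thread of the block must participate.
      float sum = cub::BlockReduce<float, BlockDim>(temp).Sum(partial);

      // Only thread 0 holds the defined result, as in ReduceKernel2D.
      if (threadIdx.x == 0) y[blockIdx.x] = sum;
    }

    // Launch sketch: BlockSumKernel<256><<<rows, 256, 0, stream>>>(x, y, n);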
@@ -107,12 +98,7 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
                              Array<int, ReduceRank> reduce_strides,
                              Array<int, Rank - ReduceRank> left_dim,
                              Array<int, Rank - ReduceRank> left_strides) {
-#ifdef __HIPCC__
-  __shared__
-      typename hipcub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#else
   __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
-#endif
   Array<int, Rank> sub_index;
   int left_idx = blockIdx.x;
   for (int i = 0; i < Rank - ReduceRank; ++i) {
@@ -144,13 +130,8 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
   }
   __syncthreads();
 
-#ifdef __HIPCC__
-  reduce_var = hipcub::BlockReduce<Ty, BlockDim>(temp_storage)
-                   .Reduce(reduce_var, reducer);
-#else
   reduce_var =
       cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
-#endif
 
   if (threadIdx.x == 0) {
     y[blockIdx.x] = reduce_var;
@@ -238,32 +219,17 @@ static void TensorReduceImpl(
   int rank = x_strides.size();
   int reduce_rank = reduce_strides.size();
   if (rank == reduce_rank) {
-#ifdef __HIPCC__
-    hipcub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
-        x_data, transformer);
-#else
     cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
         x_data, transformer);
-#endif
     size_t temp_storage_bytes = 0;
-#ifdef __HIPCC__
-    hipcub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
-                                 reduce_num, reducer, init, stream);
-#else
     cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
                               reduce_num, reducer, init, stream);
-#endif
     framework::Tensor tmp;
     auto* temp_storage = tmp.mutable_data<uint8_t>(
         framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
         place);
-#ifdef __HIPCC__
-    hipcub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x,
-                                 y_data, reduce_num, reducer, init, stream);
-#else
     cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
                               reduce_num, reducer, init, stream);
-#endif
     return;
   }
 
   if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
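Note: the rank == reduce_rank branch above follows CUB's two-phase DeviceReduce protocol, which the removed hipcub branches duplicated call for call: the first Reduce receives a null workspace pointer and only writes the required byte count into temp_storage_bytes, the caller allocates that much device memory (here via a temporary framework::Tensor), and a second, otherwise identical call performs the reduction. A standalone sketch of the same protocol with a TransformInputIterator fused in (SumOfSquares, the Square functor, and the plain cudaMalloc in place of this file's tensor-backed allocation are all illustrative):

    #include <cuda_runtime.h>
    #include <cub/cub.cuh>

    struct Square {
      __host__ __device__ float operator()(const float& v) const {
        return v * v;
      }
    };

    // Reduces the sum of squares of d_in[0..n) into d_out[0] on stream.
    void SumOfSquares(const float* d_in, float* d_out, int n,
                      cudaStream_t stream) {
      // Fuse the transform into the load; no intermediate buffer is written.
      cub::TransformInputIterator<float, Square, const float*> it(d_in,
                                                                  Square());

      // Phase 1: null workspace pointer, so CUB only reports the bytes needed.
      void* d_temp = nullptr;
      size_t temp_bytes = 0;
      cub::DeviceReduce::Reduce(d_temp, temp_bytes, it, d_out, n, cub::Sum(),
                                0.f, stream);

      // Phase 2: the same call with a real workspace runs the reduction.
      cudaMalloc(&d_temp, temp_bytes);
      cub::DeviceReduce::Reduce(d_temp, temp_bytes, it, d_out, n, cub::Sum(),
                                0.f, stream);
      cudaFree(d_temp);
    }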