@@ -18,9 +18,10 @@ limitations under the License. */
#include <set>
#include <string>
#include <vector>

#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/cast_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"

namespace paddle {
@@ -34,6 +35,110 @@ namespace operators {
  }

using Tensor = framework::Tensor;
using DDim = framework::DDim;
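
// GetShuffledDim computes the transposed shape (dst_dims) and the
// permutation (perm_axis) that move every reduced dimension to the tail
// while keeping the unreduced dimensions in their original order.
// Worked example (illustrative values): with src_dims = [2, 3, 4, 5, 6, 7,
// 8, 9] and reduced_dims = {1, 3}, this produces
// perm_axis = {0, 2, 4, 5, 6, 7, 1, 3} and dst_dims = [2, 4, 6, 7, 8, 9, 3, 5].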
inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims,
                           const std::vector<int>& reduced_dims,
                           std::vector<int>* perm_axis) {
  // check if it's a reduced dim
  std::vector<bool> src_dims_check(src_dims.size(), false);
  size_t src_size = src_dims.size();
  size_t reduce_size = reduced_dims.size();
  for (size_t i = 0; i < reduce_size; ++i) {
    dst_dims->at(src_size - reduce_size + i) = src_dims[reduced_dims[i]];
    (*perm_axis)[src_size - reduce_size + i] = reduced_dims[i];
    src_dims_check[reduced_dims[i]] = true;
  }

  size_t offset = 0;
  for (size_t i = 0; i < src_dims_check.size(); ++i) {
    bool is_reduced = src_dims_check[i];
    if (!is_reduced) {
      (*perm_axis)[offset] = i;
      dst_dims->at(offset++) = src_dims[i];
    }
  }
}
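
// GetShuffledInput materializes the permutation from GetShuffledDim: it
// resizes shuffled_input to the shuffled shape and physically transposes
// input into it with math::TransposeNormal.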
template <typename DeviceContext, typename OutT>
void GetShuffledInput(const framework::ExecutionContext& context,
                      const Tensor* input, Tensor* shuffled_input,
                      const std::vector<int>& dims) {
  DDim shuffled_dims(input->dims());
  std::vector<int> perm_axis(input->dims().size());
  GetShuffledDim(input->dims(), &shuffled_dims, dims, &perm_axis);

  shuffled_input->Resize(shuffled_dims);
  shuffled_input->mutable_data<OutT>(context.GetPlace());

  math::TransposeNormal<DeviceContext, OutT> trans;
  trans(context.template device_context<DeviceContext>(), *input,
        shuffled_input, perm_axis);
}
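
// GetOriginDimFromShuffled computes the inverse of the permutation produced
// by GetShuffledDim, so a tensor in shuffled layout can be transposed back
// to the original dimension order (used by the gradient path below).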
inline void GetOriginDimFromShuffled(const DDim& src_dim,
                                     const std::vector<int>& dims,
                                     std::vector<int>* origin_dim) {
  DDim shuffled_dims(src_dim);
  size_t n = src_dim.size();
  std::vector<int> perm_axis(n);
  GetShuffledDim(src_dim, &shuffled_dims, dims, &perm_axis);
  for (size_t i = 0; i < n; ++i) {
    (*origin_dim)[perm_axis[i]] = i;
  }
}
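
// HandleLargeDim reduces tensors whose rank exceeds the compile-time
// HANDLE_DIM instantiations (rank > 6): it shuffles the reduced dims to the
// end, views the result as a 2-D tensor of shape {unreduced, reduced}, and
// reduces along axis 1 with the rank-2 ReduceFunctor.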
template <typename DeviceContext, typename OutT, typename Functor>
void HandleLargeDim(const framework::ExecutionContext& context,
                    const Tensor* input, Tensor* output,
                    const std::vector<int>& dims, bool keep_dim) {
  // shuffle the reduced dim to the end
  Tensor shuffled_input;
  GetShuffledInput<DeviceContext, OutT>(context, input, &shuffled_input, dims);

  // reshape to a 2-D tensor whose shape is {unreduced, reduced}
  const int64_t unreduced = output->numel();
  const int64_t reduced = shuffled_input.numel() / unreduced;
  shuffled_input.Resize({unreduced, reduced});
  DDim output_dim = output->dims();
  output->Resize({unreduced});
  ReduceFunctor<DeviceContext, OutT, 2, 1, Functor>(
      context.template device_context<DeviceContext>(), shuffled_input, output,
      {1}, keep_dim);
  output->Resize(output_dim);
}
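
// HandleLargeDimGrad is the backward counterpart of HandleLargeDim: it
// shuffles and flattens X the same way, computes dX on the 2-D view with
// ReduceGradFunctor, and then transposes dX back to X's original layout
// using the inverse permutation from GetOriginDimFromShuffled.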
template <typename DeviceContext, typename T, typename Functor>
void HandleLargeDimGrad(const framework::ExecutionContext& context,
                        const framework::Tensor* x,
                        const framework::Tensor* out,
                        const framework::Tensor* dout, framework::Tensor* dx,
                        const std::vector<int>& dims) {
  const int64_t unreduced = out->numel();
  const int64_t reduced = x->numel() / unreduced;
  DDim out_dim(out->dims());
  DDim x_dim(x->dims());
  // transpose and reshape X
  Tensor shuffled_x;
  GetShuffledInput<DeviceContext, T>(context, x, &shuffled_x, dims);
  DDim shuffled_dim = shuffled_x.dims();
  shuffled_x.Resize({unreduced, reduced});
  // reshape dX to {unreduced, reduced}
  dx->Resize({unreduced, reduced});
  ReduceGradFunctor<DeviceContext, T, 2, Functor>(
      context.template device_context<DeviceContext>(), shuffled_x, *out, *dout,
      dx, {1});
  // transpose dX back to the original layout of X
  std::vector<int> origin_axis(x_dim.size());
  GetOriginDimFromShuffled(x_dim, dims, &origin_axis);
  Tensor dx_tmp;
  framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp);
  dx_tmp.Resize(shuffled_dim);
  dx->Resize(x_dim);
  math::TransposeNormal<DeviceContext, T> trans;
  trans(context.template device_context<DeviceContext>(), dx_tmp, dx,
        origin_axis);
}
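
// A minimal usage sketch (illustrative only; `ctx`, `x`, `out`, `dout`, `dx`
// and SumGradFunctor are assumed to come from the surrounding kernel, not
// from this file):
//
//   std::vector<int> dims = {1, 3};  // reduce over dims 1 and 3
//   if (x->dims().size() > 6) {
//     HandleLargeDimGrad<platform::CPUDeviceContext, float, SumGradFunctor>(
//         ctx, x, out, dout, dx, dims);
//   }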

template <typename DeviceContext, typename T, typename Functor>
struct ReduceKernelFunctor {
@@ -69,6 +174,10 @@ struct ReduceKernelFunctor {
    } else {
      int ndim = input->dims().size();
      int rdim = dims.size();
      if (ndim > 6) {
        HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
                                                     dims, keep_dim);
      } else {
        HANDLE_DIM(6, 5);
        HANDLE_DIM(6, 4);
        HANDLE_DIM(6, 3);
@@ -87,6 +196,7 @@ struct ReduceKernelFunctor {
        HANDLE_DIM(1, 1);
      }
    }
  }
};
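
// Note: each HANDLE_DIM(NDIM, RDIM) above expands to a guarded ReduceFunctor
// instantiation for one concrete (tensor rank, reduced rank) pair; ranks up
// to 6 are enumerated explicitly, and anything larger falls through to
// HandleLargeDim.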

template <typename DeviceContext, typename T, typename Functor>
class ReduceKernel : public framework::OpKernel<T> {
@@ -137,7 +247,6 @@ class ReduceKernel : public framework::OpKernel<T> {
    }
  }
};

template <typename DeviceContext, typename OutT, typename Functor>
class BoolReduceKernel : public framework::OpKernel<OutT> {
 public:
@@ -175,15 +284,19 @@ class BoolReduceKernel : public framework::OpKernel<OutT> {
      int ndim = input->dims().size();
      int rdim = dims.size();
      // commented out temporarily to speed up compilation
      // HANDLE_DIM(6, 5);
      // HANDLE_DIM(6, 4);
      // HANDLE_DIM(6, 3);
      // HANDLE_DIM(6, 2);
      // HANDLE_DIM(6, 1);
      // HANDLE_DIM(5, 4);
      // HANDLE_DIM(5, 3);
      // HANDLE_DIM(5, 2);
      // HANDLE_DIM(5, 1);
      if (ndim > 6) {
        HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
                                                     dims, keep_dim);
      } else {
        HANDLE_DIM(6, 5);
        HANDLE_DIM(6, 4);
        HANDLE_DIM(6, 3);
        HANDLE_DIM(6, 2);
        HANDLE_DIM(6, 1);
        HANDLE_DIM(5, 4);
        HANDLE_DIM(5, 3);
        HANDLE_DIM(5, 2);
        HANDLE_DIM(5, 1);
        HANDLE_DIM(4, 3);
        HANDLE_DIM(4, 2);
        HANDLE_DIM(4, 1);
@@ -193,6 +306,7 @@ class BoolReduceKernel : public framework::OpKernel<OutT> {
        HANDLE_DIM(1, 1);
      }
    }
  }
};

template <typename DeviceContext, typename T, typename Functor,
          bool kNoNeedBufferX = false, bool kNoNeedBufferY = false>
@@ -279,6 +393,10 @@ class ReduceGradKernel : public framework::OpKernel<T> {
              context.template device_context<DeviceContext>(), *input0,
              *input1, *input2, output, dims);
          break;
        default:
          HandleLargeDimGrad<DeviceContext, T, Functor>(context, input0, input1,
                                                        input2, output, dims);
          break;
      }
    }
  }
@@ -313,12 +431,6 @@ class ReduceOp : public framework::OperatorWithKernel {
    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceOp");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
    PADDLE_ENFORCE_LE(x_rank, 6,
                      platform::errors::InvalidArgument(
                          "The input tensor X's dimensions of ReduceOp "
                          "should be less than or equal to 6. But received X's "
                          "dimensions = %d, X's shape = [%s].",
                          x_rank, x_dims));
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    PADDLE_ENFORCE_GT(dims.size(), 0,
                      platform::errors::InvalidArgument(
@@ -402,11 +514,6 @@ class ReduceGradOp : public framework::OperatorWithKernel {
                   "Out@GRAD", "ReduceOp");
    auto x_dims = ctx->GetInputDim("X");
    auto x_rank = x_dims.size();
    PADDLE_ENFORCE_LE(x_rank, 6,
                      platform::errors::InvalidArgument(
                          "Tensors with rank at most 6 are supported by "
                          "ReduceOp. Received tensor with rank %d.",
                          x_rank));
    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
    for (size_t i = 0; i < dims.size(); ++i) {
      PADDLE_ENFORCE_LT(dims[i], x_rank,