From 518325f1e77c28ec5583e082e96983a219d837dd Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 27 Feb 2019 18:00:06 +0800 Subject: [PATCH 01/18] add softmax_axis CPU kernel. test=develop --- paddle/fluid/operators/softmax_op.cc | 11 ++++++ paddle/fluid/operators/softmax_op.h | 51 ++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 8fbf299a7c..bd3b14775f 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -37,6 +37,13 @@ class SoftmaxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of SoftmaxOp should not be null."); + auto dim_x = ctx->GetInputDim("X"); + auto rank_x = dim_x.size(); + auto axis = ctx->Attrs().Get("axis"); + PADDLE_ENFORCE(axis >= -1 && axis < rank_x, + "Attr(axis) value should larger equal then -1" + "and less then the rank of Input(X)"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -80,6 +87,10 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { "The input tensor of softmax, " "whose last dimension is the input_feature_dimensions."); AddOutput("Out", "The normalized values with the same shape as X."); + AddAttr("axis", + "The dimension of Input(x) to perform softmax," + "default -1 for last dimension") + .SetDefault(-1); AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 91829d5761..ad41e52116 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -13,27 +13,69 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/transpose_op.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static inline void TransposeAxisToEnd(const Tensor& x, const Tensor& out, + Tensor* x_trans, Tensor* out_trans, + const int axis, std::vector perm, + const framework::ExecutionContext& ctx) { + auto dim_x = x.dims(); + int rank = dim_x.size(); + + if (axis == -1 || axis == rank - 1) { + *x_trans = x; + *out_trans = out; + return; + } + + auto& dev_ctx = ctx.template device_context(); + std::vector shape; + for (int i = 0; i < rank - 1; i++) { + if (i == axis) { + perm.push_back(rank - 1); + shape.push_back(dim_x[rank - 1]); + } else { + perm.push_back(i); + shape.push_back(dim_x[i]); + } + } + perm.push_back(axis); + shape.push_back(dim_x[axis]); + + x_trans->mutable_data(framework::make_ddim(shape), ctx.GetPlace()); + out_trans->mutable_data(framework::make_ddim(shape), ctx.GetPlace()); + TransCompute(rank, dev_ctx, x, x_trans, perm); + TransCompute(rank, dev_ctx, out, out_trans, perm); +} + template class SoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* X = context.Input("X"); auto* Out = context.Output("Out"); + const int axis = context.Attr("axis"); // allocate memory on device. 
Out->mutable_data(context.GetPlace()); + Tensor X_trans, Out_trans; + std::vector perm; + TransposeAxisToEnd(*X, *Out, &X_trans, &Out_trans, axis, + perm, context); + int rank = X->dims().size(); - Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); - Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + Tensor X_2d = framework::ReshapeToMatrix(X_trans, rank - 1); + Tensor Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); #ifdef PADDLE_ON_INFERENCE math::SoftmaxFunctor()( @@ -42,6 +84,11 @@ class SoftmaxKernel : public framework::OpKernel { math::SoftmaxFunctor()( context.template device_context(), &X_2d, &Out_2d); #endif + + if (axis != -1 && axis != rank - 1) { + auto& dev_ctx = context.template device_context(); + TransCompute(rank, dev_ctx, Out_trans, Out, perm); + } } }; From 6cb66721d2e98d9f8f6b15478ba4796f14eecab0 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 4 Mar 2019 15:23:35 +0000 Subject: [PATCH 02/18] add cudnn support. test=develop --- paddle/fluid/operators/softmax_cudnn_op.cu.cc | 70 ++++++++++++---- paddle/fluid/operators/softmax_op.h | 83 ++++++++++++------- .../fluid/tests/unittests/test_softmax_op.py | 61 +++++++++++++- 3 files changed, 164 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index ad3e5543f1..84151d70b9 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -24,22 +25,40 @@ template class SoftmaxCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = context.template device_context(); auto* X = context.Input("X"); auto* Out = context.Output("Out"); + // auto dims = X->dims(); + const int axis = context.Attr("axis"); + int rank = X->dims().size(); // allocate memory on device. 
Out->mutable_data(context.GetPlace()); - auto dims = X->dims(); - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - framework::LoDTensor flattened_x; - framework::LoDTensor flattened_out; - flattened_x.ShareDataWith(*X).Resize(flattened_dims); - flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + std::vector perm, shape; + CalcTransPermAndShapeByAxis(*X, axis, &perm, &shape); + + Tensor X_2d, Out_2d; + Tensor X_trans, Out_trans; + if (axis != -1 && axis != rank - 1) { + X_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + Out_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + TransCompute(rank, dev_ctx, *X, &X_trans, perm); + TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); + X_2d = framework::ReshapeToMatrix(X_trans, rank - 1); + Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); + } else { + X_2d = framework::ReshapeToMatrix(*X, rank - 1); + Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + } math::SoftmaxCUDNNFunctor()( context.template device_context(), - &flattened_x, &flattened_out); + &X_2d, &Out_2d); + + if (axis != -1 && axis != rank - 1) { + TransCompute(rank, dev_ctx, Out_trans, Out, perm); + } } }; @@ -47,25 +66,44 @@ template class SoftmaxGradCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = context.template device_context(); auto* Out = context.Input("Out"); auto* dOut = context.Input(framework::GradVarName("Out")); auto* dX = context.Output(framework::GradVarName("X")); + const int axis = context.Attr("axis"); + int rank = Out->dims().size(); // allocate memory on device. dX->mutable_data(context.GetPlace()); - auto dims = Out->dims(); - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - framework::LoDTensor flattened_out; - framework::LoDTensor flattened_d_out; - framework::LoDTensor flattened_d_x; - flattened_out.ShareDataWith(*Out).Resize(flattened_dims); - flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); - flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); + std::vector perm, shape; + CalcTransPermAndShapeByAxis(*dX, axis, &perm, &shape); + + Tensor dX_2d, Out_2d, dOut_2d; + Tensor dX_trans, Out_trans, dOut_trans; + if (axis != -1 && axis != rank - 1) { + dX_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + Out_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + dOut_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + TransCompute(rank, dev_ctx, *dX, &dX_trans, perm); + TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); + TransCompute(rank, dev_ctx, *dOut, &dOut_trans, perm); + dX_2d = framework::ReshapeToMatrix(dX_trans, rank - 1); + Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); + dOut_2d = framework::ReshapeToMatrix(dOut_trans, rank - 1); + } else { + dX_2d = framework::ReshapeToMatrix(*dX, rank - 1); + Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1); + } math::SoftmaxGradCUDNNFunctor()( context.template device_context(), - &flattened_out, &flattened_d_out, &flattened_d_x); + &Out_2d, &dOut_2d, &dX_2d); + + if (axis != -1 && axis != rank - 1) { + TransCompute(rank, dev_ctx, dX_trans, dX, perm); + } } }; diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index ad41e52116..1810b23e0d 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -23,59 
+23,58 @@ namespace operators { using Tensor = framework::Tensor; -template -static inline void TransposeAxisToEnd(const Tensor& x, const Tensor& out, - Tensor* x_trans, Tensor* out_trans, - const int axis, std::vector perm, - const framework::ExecutionContext& ctx) { +static inline void CalcTransPermAndShapeByAxis(const Tensor& x, const int axis, + std::vector* perm, std::vector* shape) { auto dim_x = x.dims(); int rank = dim_x.size(); if (axis == -1 || axis == rank - 1) { - *x_trans = x; - *out_trans = out; return; } - auto& dev_ctx = ctx.template device_context(); - std::vector shape; for (int i = 0; i < rank - 1; i++) { if (i == axis) { - perm.push_back(rank - 1); - shape.push_back(dim_x[rank - 1]); + perm->push_back(rank - 1); + shape->push_back(dim_x[rank - 1]); } else { - perm.push_back(i); - shape.push_back(dim_x[i]); + perm->push_back(i); + shape->push_back(dim_x[i]); } } - perm.push_back(axis); - shape.push_back(dim_x[axis]); - - x_trans->mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - out_trans->mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - TransCompute(rank, dev_ctx, x, x_trans, perm); - TransCompute(rank, dev_ctx, out, out_trans, perm); + perm->push_back(axis); + shape->push_back(dim_x[axis]); } template class SoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = context.template device_context(); auto* X = context.Input("X"); auto* Out = context.Output("Out"); const int axis = context.Attr("axis"); + int rank = X->dims().size(); // allocate memory on device. Out->mutable_data(context.GetPlace()); + std::vector perm, shape; + CalcTransPermAndShapeByAxis(*X, axis, &perm, &shape); + + Tensor X_2d, Out_2d; Tensor X_trans, Out_trans; - std::vector perm; - TransposeAxisToEnd(*X, *Out, &X_trans, &Out_trans, axis, - perm, context); + if (axis != -1 && axis != rank - 1) { + X_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + Out_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + TransCompute(rank, dev_ctx, *X, &X_trans, perm); + TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); + X_2d = framework::ReshapeToMatrix(X_trans, rank - 1); + Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); + } else { + X_2d = framework::ReshapeToMatrix(*X, rank - 1); + Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + } - int rank = X->dims().size(); - Tensor X_2d = framework::ReshapeToMatrix(X_trans, rank - 1); - Tensor Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); #ifdef PADDLE_ON_INFERENCE math::SoftmaxFunctor()( @@ -86,7 +85,6 @@ class SoftmaxKernel : public framework::OpKernel { #endif if (axis != -1 && axis != rank - 1) { - auto& dev_ctx = context.template device_context(); TransCompute(rank, dev_ctx, Out_trans, Out, perm); } } @@ -96,21 +94,44 @@ template class SoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = context.template device_context(); auto* Out = context.Input("Out"); auto* dOut = context.Input(framework::GradVarName("Out")); auto* dX = context.Output(framework::GradVarName("X")); + const int axis = context.Attr("axis"); + int rank = Out->dims().size(); // allocate memory on device. 
dX->mutable_data(context.GetPlace()); - int rank = Out->dims().size(); - Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); - Tensor dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1); - Tensor dX_2d = framework::ReshapeToMatrix(*dX, rank - 1); + std::vector perm, shape; + CalcTransPermAndShapeByAxis(*dX, axis, &perm, &shape); + + Tensor dX_2d, Out_2d, dOut_2d; + Tensor dX_trans, Out_trans, dOut_trans; + if (axis != -1 && axis != rank - 1) { + dX_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + Out_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + dOut_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + TransCompute(rank, dev_ctx, *dX, &dX_trans, perm); + TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); + TransCompute(rank, dev_ctx, *dOut, &dOut_trans, perm); + dX_2d = framework::ReshapeToMatrix(dX_trans, rank - 1); + Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); + dOut_2d = framework::ReshapeToMatrix(dOut_trans, rank - 1); + } else { + dX_2d = framework::ReshapeToMatrix(*dX, rank - 1); + Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1); + } math::SoftmaxGradFunctor()( context.template device_context(), &Out_2d, &dOut_2d, &dX_2d); + + if (axis != -1 && axis != rank - 1) { + TransCompute(rank, dev_ctx, dX_trans, dX, perm); + } } }; diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 5c56de6779..084fa869e3 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -31,6 +31,9 @@ class TestSoftmaxOp(OpTest): def get_x_shape(self): return [10, 10] + def get_axis(self): + return -1 + def setUp(self): self.op_type = "softmax" self.use_cudnn = False @@ -38,15 +41,15 @@ class TestSoftmaxOp(OpTest): self.dtype = np.float32 self.init_kernel_type() self.shape = self.get_x_shape() + self.axis = self.get_axis() x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) - out = np.apply_along_axis(stable_softmax, 1, - x.reshape([-1, self.shape[-1]])) - out = out.reshape(self.shape) + out = np.apply_along_axis(stable_softmax, self.axis, x) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} self.attrs = { + 'axis': self.axis, 'use_cudnn': self.use_cudnn, 'use_mkldnn': self.use_mkldnn } @@ -76,6 +79,38 @@ class TestSoftmaxOp2(TestSoftmaxOp): return [2, 3, 4, 5] +class TestSoftmaxOp3(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 0 + + +class TestSoftmaxOp4(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 1 + + +class TestSoftmaxOp5(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 2 + + +class TestSoftmaxOp5(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 3 + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxCUDNNOp(TestSoftmaxOp): @@ -90,6 +125,26 @@ class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp): return [2, 3, 4, 5] +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp3(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 1 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class 
TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 2 + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxFP16Op(TestSoftmaxOp): From 217db273371abd7b78c4a777992a6090c7e4d0ba Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Mar 2019 03:55:33 +0000 Subject: [PATCH 03/18] add mkldnn support. test=develop --- .../operators/mkldnn/softmax_mkldnn_op.cc | 128 +++++++++++++----- paddle/fluid/operators/softmax_cudnn_op.cu.cc | 1 - paddle/fluid/operators/softmax_op.cc | 11 +- python/paddle/fluid/layers/nn.py | 17 ++- .../fluid/tests/unittests/test_layers.py | 2 +- 5 files changed, 111 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 0ce5522194..4e4f482987 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -110,28 +110,51 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { "It must use CPUPlace."); auto& dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); - const Tensor* input = ctx.Input("X"); - Tensor* output = ctx.Output("Out"); + const Tensor* X = ctx.Input("X"); + Tensor* Out = ctx.Output("Out"); PADDLE_ENFORCE_EQ( - input->dims(), output->dims(), + X->dims(), Out->dims(), "The shape of softmax's input and output must be identical."); + const int axis = ctx.Attr("axis"); + int rank = X->dims().size(); + // make sure 'output' holds memory, which will be shared by // 'flattened_output' later. - output->mutable_data(ctx.GetPlace()); + Out->mutable_data(ctx.GetPlace()); + + std::vector perm, shape; + CalcTransPermAndShapeByAxis(*X, axis, &perm, &shape); + + Tensor X_2d, Out_2d; + Tensor X_trans, Out_trans; + if (axis != -1 && axis != rank - 1) { + X_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); + Out_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); + TransCompute(rank, dev_ctx, *X, &X_trans, perm); + TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); + X_2d = framework::ReshapeToMatrix(X_trans, rank - 1); + Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); + } else { + X_2d = framework::ReshapeToMatrix(*X, rank - 1); + Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + } // flatten input and output to 2-D matrixs - auto dims = input->dims(); // input and output share the same shape - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - framework::Tensor flattened_input; - framework::Tensor flattened_output; - flattened_input.ShareDataWith(*input).Resize(flattened_dims); - flattened_output.ShareDataWith(*output).Resize(flattened_dims); - - const T* input_data = flattened_input.data(); - T* output_data = flattened_output.mutable_data(ctx.GetPlace()); - - std::vector src_tz = paddle::framework::vectorize2int(flattened_dims); + // auto dims = input->dims(); // input and output share the same shape + // auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + // framework::Tensor flattened_input; + // framework::Tensor flattened_output; + // flattened_input.ShareDataWith(*input).Resize(flattened_dims); + // flattened_output.ShareDataWith(*output).Resize(flattened_dims); + + // const T* input_data = flattened_input.data(); + // T* output_data = flattened_output.mutable_data(ctx.GetPlace()); + const T* input_data = X_2d.data(); + T* output_data = 
Out_2d.mutable_data(ctx.GetPlace()); + + // std::vector src_tz = paddle::framework::vectorize2int(flattened_dims); + std::vector src_tz = paddle::framework::vectorize2int(X_2d.dims()); std::vector dst_tz = src_tz; // Same memory descriptor to be used for input and output memory::dims softmax_tz = {src_tz[0], src_tz[1]}; @@ -178,6 +201,10 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { output_data[i] < threshold ? threshold : output_data[i]; } } + + if (axis != -1 && axis != rank - 1) { + TransCompute(rank, dev_ctx, Out_trans, Out, perm); + } } }; @@ -190,33 +217,60 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); - const Tensor* output = ctx.Input("Out"); - auto* dout = ctx.template Input(framework::GradVarName("Out")); - auto* dx = + const Tensor* Out = ctx.Input("Out"); + auto* dOut = ctx.template Input(framework::GradVarName("Out")); + auto* dX = ctx.template Output(framework::GradVarName("X")); PADDLE_ENFORCE_EQ( - dout->dims(), dx->dims(), + dOut->dims(), dX->dims(), "The shape of softmax_grad's input and output must be identical."); + const int axis = ctx.Attr("axis"); + int rank = Out->dims().size(); + // make sure 'dx' holds memory, which will be shared by 'flattened_dx' // later. - dx->template mutable_data(ctx.GetPlace()); - - auto dims = dout->dims(); // input and output share the same shape - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - framework::Tensor flattened_output; - framework::Tensor flattened_dout; - framework::Tensor flattened_dx; - flattened_output.ShareDataWith(*output).Resize(flattened_dims); - flattened_dout.ShareDataWith(*dout).Resize(flattened_dims); - flattened_dx.ShareDataWith(*dx).Resize(flattened_dims); - - const T* dst_data = flattened_output.data(); - const T* diff_dst_ptr = flattened_dout.template data(); - T* diff_src_ptr = flattened_dx.template mutable_data(ctx.GetPlace()); - - std::vector dst_tz = paddle::framework::vectorize2int(flattened_dims); + dX->template mutable_data(ctx.GetPlace()); + + std::vector perm, shape; + CalcTransPermAndShapeByAxis(*dX, axis, &perm, &shape); + + Tensor dX_2d, Out_2d, dOut_2d; + Tensor dX_trans, Out_trans, dOut_trans; + if (axis != -1 && axis != rank - 1) { + dX_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); + Out_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); + dOut_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); + TransCompute(rank, dev_ctx, *dX, &dX_trans, perm); + TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); + TransCompute(rank, dev_ctx, *dOut, &dOut_trans, perm); + dX_2d = framework::ReshapeToMatrix(dX_trans, rank - 1); + Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); + dOut_2d = framework::ReshapeToMatrix(dOut_trans, rank - 1); + } else { + dX_2d = framework::ReshapeToMatrix(*dX, rank - 1); + Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1); + } + + // auto dims = dout->dims(); // input and output share the same shape + // auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + // framework::Tensor flattened_output; + // framework::Tensor flattened_dout; + // framework::Tensor flattened_dx; + // flattened_output.ShareDataWith(*output).Resize(flattened_dims); + // flattened_dout.ShareDataWith(*dout).Resize(flattened_dims); + // flattened_dx.ShareDataWith(*dx).Resize(flattened_dims); + + // const T* dst_data = 
flattened_output.data(); + // const T* diff_dst_ptr = flattened_dout.template data(); + // T* diff_src_ptr = flattened_dx.template mutable_data(ctx.GetPlace()); + const T* dst_data = Out_2d.data(); + const T* diff_dst_ptr = dOut_2d.template data(); + T* diff_src_ptr = dX_2d.template mutable_data(ctx.GetPlace()); + + std::vector dst_tz = paddle::framework::vectorize2int(Out_2d.dims()); std::vector src_tz(dst_tz); // Same memory descriptor to be used for input and output @@ -261,6 +315,10 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { std::vector pipeline{*softmax_bwd_p}; stream(stream::kind::eager).submit(pipeline).wait(); + + if (axis != -1 && axis != rank - 1) { + TransCompute(rank, dev_ctx, dX_trans, dX, perm); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index 84151d70b9..dc5b7bb0af 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -28,7 +28,6 @@ class SoftmaxCUDNNKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); auto* X = context.Input("X"); auto* Out = context.Output("Out"); - // auto dims = X->dims(); const int axis = context.Attr("axis"); int rank = X->dims().size(); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index bd3b14775f..02f256fa64 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -85,10 +85,10 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The input tensor of softmax, " - "whose last dimension is the input_feature_dimensions."); + "whose :attr:`axis` dimension is the input_feature_dimensions."); AddOutput("Out", "The normalized values with the same shape as X."); AddAttr("axis", - "The dimension of Input(x) to perform softmax," + "The dimension index of Input(x) to perform softmax," "default -1 for last dimension") .SetDefault(-1); AddAttr( @@ -115,12 +115,13 @@ Softmax Operator. The input of the softmax operator is a tensor of any rank. The output tensor has the same shape as the input. -The input tensor will first be logically flattened to a 2-D matrix. The matrix's -second dimension(row length) is as same as the last dimension of the input +The :attr:`axis` th dimension of the input tensor will be permuted to the last. +Then the input tensor will be logically flattened to a 2-D matrix. The matrix's +second dimension(row length) is as same as the :attr:`axis` dimension of the input tensor, and the first dimension(column length) is the product of all other dimensions of the input tensor. For each row of the matrix, the softmax operator squashes the K-dimensional(K is the width of the matrix, which is also the size -of the input tensor's last dimension) vector of arbitrary real values to a +of the input tensor's :attr:`axis` dimension) vector of arbitrary real values to a K-dimensional vector of real values in the range [0, 1] that add up to 1. It computes the exponential of the given dimension and the sum of exponential values of all the other dimensions in the K-dimensional vector input. 
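The permute-then-flatten scheme described in the operator comment above (and implemented by CalcTransPermAndShapeByAxis plus TransCompute in the kernels) can be summarized outside the operator code. Below is a minimal NumPy sketch, for illustration only and not part of the patch; the helper names stable_softmax and softmax_by_axis are hypothetical. It swaps the chosen dimension with the last one, flattens to a 2-D matrix, applies a row-wise softmax, and transposes back, which is exactly the behaviour the CPU/CUDNN/MKLDNN kernels add for a non-last axis.

# Minimal NumPy sketch (illustration only, not part of the patch);
# `stable_softmax` and `softmax_by_axis` are hypothetical helper names.
import numpy as np

def stable_softmax(x):
    # numerically stable softmax over the last dimension
    shifted = x - x.max(axis=-1, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=-1, keepdims=True)

def softmax_by_axis(x, axis=-1):
    rank = x.ndim
    if axis == -1 or axis == rank - 1:
        return stable_softmax(x)
    # swap `axis` with the last dimension, mirroring CalcTransPermAndShapeByAxis
    perm = list(range(rank))
    perm[axis], perm[rank - 1] = perm[rank - 1], perm[axis]
    x_trans = np.transpose(x, perm)
    # flatten to a 2-D matrix: [product of leading dims, size of dim `axis`]
    out_2d = stable_softmax(x_trans.reshape(-1, x_trans.shape[-1]))
    out_trans = out_2d.reshape(x_trans.shape)
    # transpose back; the swap permutation is its own inverse
    return np.transpose(out_trans, perm)

x = np.random.uniform(0.1, 1, (2, 3, 4, 5)).astype(np.float32)
ref = np.apply_along_axis(stable_softmax, 1, x)  # reference softmax along axis 1
assert np.allclose(softmax_by_axis(x, axis=1), ref, atol=1e-6)

The assertion mirrors the reference computation used in test_softmax_op.py (np.apply_along_axis with stable_softmax), so the sketch and the unit tests check the same property.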
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index dbe495b75c..273d74ca6e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1819,17 +1819,18 @@ def sequence_softmax(input, use_cudnn=False, name=None): return softmax_out -def softmax(input, use_cudnn=False, name=None): +def softmax(input, use_cudnn=False, name=None, axis=-1): """ The input of the softmax operator is a tensor of any rank. The output tensor has the same shape as the input. - The input tensor will first be logically flattened to a 2-D matrix. The matrix's - second dimension(row length) is as same as the last dimension of the input + The :attr:`axis` th dimension of the input tensor will be permuted to the last. + Then the input tensor will be logically flattened to a 2-D matrix. The matrix's + second dimension(row length) is as same as the :attr:`axis` th dimension of the input tensor, and the first dimension(column length) is the product of all other dimensions of the input tensor. For each row of the matrix, the softmax operator squashes the K-dimensional(K is the width of the matrix, which is also the size - of the input tensor's last dimension) vector of arbitrary real values to a + of the input tensor's :attr:`axis` th dimension) vector of arbitrary real values to a K-dimensional vector of real values in the range [0, 1] that add up to 1. It computes the exponential of the given dimension and the sum of exponential @@ -1851,6 +1852,7 @@ def softmax(input, use_cudnn=False, name=None): False by default. Default: False name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. Default: None. + axis (int): The index of dimension to perform softmax calculation. Default: -1. Returns: Variable: output of softmax @@ -1860,7 +1862,7 @@ def softmax(input, use_cudnn=False, name=None): .. code-block:: python fc = fluid.layers.fc(input=x, size=10) - softmax = fluid.layers.softmax(input=fc) + softmax = fluid.layers.softmax(input=fc, axis=1) """ helper = LayerHelper('softmax', **locals()) @@ -1870,7 +1872,10 @@ def softmax(input, use_cudnn=False, name=None): type="softmax", inputs={"X": input}, outputs={"Out": softmax_out}, - attrs={"use_cudnn": use_cudnn}) + attrs={ + "axis": axis, + "use_cudnn": use_cudnn + }) return softmax_out diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 885ee170e8..4e255293b6 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -513,7 +513,7 @@ class TestBook(unittest.TestCase): with program_guard(program): data = layers.data(name='data', shape=[10], dtype='float32') hid = layers.fc(input=data, size=20) - self.assertIsNotNone(layers.softmax(hid)) + self.assertIsNotNone(layers.softmax(hid, axis=1)) print(str(program)) def test_space_to_depth(self): From 365e6cfd15e64e381d64ff8554ca8b08ff7f33cc Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Mar 2019 07:35:42 +0000 Subject: [PATCH 04/18] add mkldnn support. 
test=develop --- paddle/fluid/API.spec | 2 +- .../operators/mkldnn/softmax_mkldnn_op.cc | 79 ++++++++----------- .../mkldnn/test_softmax_mkldnn_op.py | 24 ++++++ .../fluid/tests/unittests/test_softmax_op.py | 12 ++- 4 files changed, 71 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 66fc323e6b..251b1673a9 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -86,7 +86,7 @@ paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '37042620f9bd3a2da6e5d3138b2f724b')) paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a194fb80614023f543df3949fbd0d0b8')) paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '19ef6f9cdd27feac8a1ae060f19c10b4')) -paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'f19dd380864e61134ce3814e4be0de4b')) +paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', 'f19dd380864e61134ce3814e4be0de4b')) paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'bbd84e855e660cd1084bb71a2fd0cdaa')) paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625')) paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95')) diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 4e4f482987..cff8cdd8f5 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -131,29 +131,22 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { if (axis != -1 && axis != rank - 1) { X_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); Out_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - TransCompute(rank, dev_ctx, *X, &X_trans, perm); - TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); - X_2d = framework::ReshapeToMatrix(X_trans, rank - 1); - Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); + TransCompute(rank, dev_ctx, *X, &X_trans, perm); + TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); + auto dims = X_trans.dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + X_2d.ShareDataWith(X_trans).Resize(flattened_dims); + Out_2d.ShareDataWith(Out_trans).Resize(flattened_dims); } else { - X_2d = framework::ReshapeToMatrix(*X, rank - 1); 
- Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); + auto dims = X->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + X_2d.ShareDataWith(*X).Resize(flattened_dims); + Out_2d.ShareDataWith(*Out).Resize(flattened_dims); } - // flatten input and output to 2-D matrixs - // auto dims = input->dims(); // input and output share the same shape - // auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - // framework::Tensor flattened_input; - // framework::Tensor flattened_output; - // flattened_input.ShareDataWith(*input).Resize(flattened_dims); - // flattened_output.ShareDataWith(*output).Resize(flattened_dims); - - // const T* input_data = flattened_input.data(); - // T* output_data = flattened_output.mutable_data(ctx.GetPlace()); const T* input_data = X_2d.data(); T* output_data = Out_2d.mutable_data(ctx.GetPlace()); - // std::vector src_tz = paddle::framework::vectorize2int(flattened_dims); std::vector src_tz = paddle::framework::vectorize2int(X_2d.dims()); std::vector dst_tz = src_tz; // Same memory descriptor to be used for input and output @@ -184,10 +177,16 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { // We cannot use softmax_dst_memory_p to get prim desc as // it contains flattened dims (2D) while output tensor can // have 2,3,4+ dims - auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(output->dims()), - mkldnn::memory::format::blocked); - output->set_mkldnn_prim_desc(output_mem_pd); + if (axis != -1 && axis != rank - 1) { + auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( + shape, mkldnn::memory::format::blocked); + Out_trans.set_mkldnn_prim_desc(output_mem_pd); + } else { + auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(Out->dims()), + mkldnn::memory::format::blocked); + Out->set_mkldnn_prim_desc(output_mem_pd); + } std::vector pipeline{ *(static_cast(softmax_p.get()))}; @@ -203,7 +202,7 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { } if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, Out_trans, Out, perm); + TransCompute(rank, dev_ctx, Out_trans, Out, perm); } } }; @@ -242,30 +241,22 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { dX_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); Out_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); dOut_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - TransCompute(rank, dev_ctx, *dX, &dX_trans, perm); - TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); - TransCompute(rank, dev_ctx, *dOut, &dOut_trans, perm); - dX_2d = framework::ReshapeToMatrix(dX_trans, rank - 1); - Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); - dOut_2d = framework::ReshapeToMatrix(dOut_trans, rank - 1); + TransCompute(rank, dev_ctx, *dX, &dX_trans, perm); + TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); + TransCompute(rank, dev_ctx, *dOut, &dOut_trans, perm); + auto dims = dX_trans.dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + dX_2d.ShareDataWith(dX_trans).Resize(flattened_dims); + Out_2d.ShareDataWith(Out_trans).Resize(flattened_dims); + dOut_2d.ShareDataWith(dOut_trans).Resize(flattened_dims); } else { - dX_2d = framework::ReshapeToMatrix(*dX, rank - 1); - Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); - dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1); + auto dims = dX->dims(); + auto flattened_dims 
= framework::flatten_to_2d(dims, dims.size() - 1); + dX_2d.ShareDataWith(*dX).Resize(flattened_dims); + Out_2d.ShareDataWith(*Out).Resize(flattened_dims); + dOut_2d.ShareDataWith(*dOut).Resize(flattened_dims); } - // auto dims = dout->dims(); // input and output share the same shape - // auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - // framework::Tensor flattened_output; - // framework::Tensor flattened_dout; - // framework::Tensor flattened_dx; - // flattened_output.ShareDataWith(*output).Resize(flattened_dims); - // flattened_dout.ShareDataWith(*dout).Resize(flattened_dims); - // flattened_dx.ShareDataWith(*dx).Resize(flattened_dims); - - // const T* dst_data = flattened_output.data(); - // const T* diff_dst_ptr = flattened_dout.template data(); - // T* diff_src_ptr = flattened_dx.template mutable_data(ctx.GetPlace()); const T* dst_data = Out_2d.data(); const T* diff_dst_ptr = dOut_2d.template data(); T* diff_src_ptr = dX_2d.template mutable_data(ctx.GetPlace()); @@ -317,7 +308,7 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { stream(stream::kind::eager).submit(pipeline).wait(); if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, dX_trans, dX, perm); + TransCompute(rank, dev_ctx, dX_trans, dX, perm); } } }; diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py index 748b77f2bf..3cf05d5d9f 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py @@ -32,6 +32,30 @@ class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): return [2, 3, 4, 5] +class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 0 + + +class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 1 + + +class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 2 + + # Check if primitives already exist in backward class TestSoftmaxMKLDNNPrimitivesAlreadyExist(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 084fa869e3..2e779270f0 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -131,13 +131,23 @@ class TestSoftmaxCUDNNOp3(TestSoftmaxCUDNNOp): def get_x_shape(self): return [2, 3, 4, 5] + def get_axis(self): + return 0 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp4(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + def get_axis(self): return 1 @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp): +class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp): def get_x_shape(self): return [2, 3, 4, 5] From 3e4f3434e69ac5bf38be30aa89137a481f21b2de Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Mar 2019 13:02:15 +0000 Subject: [PATCH 05/18] fix API.spec. 
test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 251b1673a9..8849e31025 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -86,7 +86,7 @@ paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '37042620f9bd3a2da6e5d3138b2f724b')) paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a194fb80614023f543df3949fbd0d0b8')) paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '19ef6f9cdd27feac8a1ae060f19c10b4')) -paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', 'f19dd380864e61134ce3814e4be0de4b')) +paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', '85f9690b1b285def19077a41d9dba36c')) paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'bbd84e855e660cd1084bb71a2fd0cdaa')) paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625')) paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95')) From 2ddd23dac8629d4e6f3294f438dd2be8e383c794 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sat, 9 Mar 2019 17:30:18 +0800 Subject: [PATCH 06/18] fix format. 
test=develop --- .../operators/mkldnn/softmax_mkldnn_op.cc | 21 ++++++--- paddle/fluid/operators/softmax_cudnn_op.cu.cc | 46 ++++++++++++------- paddle/fluid/operators/softmax_op.cc | 1 + paddle/fluid/operators/softmax_op.h | 13 ++++-- python/paddle/fluid/layers/nn.py | 6 +-- 5 files changed, 54 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index cff8cdd8f5..c73dfd65e7 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -131,8 +131,10 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { if (axis != -1 && axis != rank - 1) { X_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); Out_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - TransCompute(rank, dev_ctx, *X, &X_trans, perm); - TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); + TransCompute(rank, dev_ctx, *X, &X_trans, + perm); + TransCompute(rank, dev_ctx, *Out, + &Out_trans, perm); auto dims = X_trans.dims(); auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); X_2d.ShareDataWith(X_trans).Resize(flattened_dims); @@ -202,7 +204,8 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { } if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, Out_trans, Out, perm); + TransCompute(rank, dev_ctx, Out_trans, Out, + perm); } } }; @@ -241,9 +244,12 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { dX_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); Out_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); dOut_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - TransCompute(rank, dev_ctx, *dX, &dX_trans, perm); - TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); - TransCompute(rank, dev_ctx, *dOut, &dOut_trans, perm); + TransCompute(rank, dev_ctx, *dX, &dX_trans, + perm); + TransCompute(rank, dev_ctx, *Out, + &Out_trans, perm); + TransCompute(rank, dev_ctx, *dOut, + &dOut_trans, perm); auto dims = dX_trans.dims(); auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); dX_2d.ShareDataWith(dX_trans).Resize(flattened_dims); @@ -308,7 +314,8 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { stream(stream::kind::eager).submit(pipeline).wait(); if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, dX_trans, dX, perm); + TransCompute(rank, dev_ctx, dX_trans, dX, + perm); } } }; diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index dc5b7bb0af..9e24c76793 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/softmax_op.h" namespace paddle { namespace operators { @@ -25,7 +25,8 @@ template class SoftmaxCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& dev_ctx = + context.template device_context(); auto* X = context.Input("X"); auto* Out = context.Output("Out"); const int axis = context.Attr("axis"); @@ -41,9 +42,12 @@ class SoftmaxCUDNNKernel : public framework::OpKernel { Tensor X_trans, Out_trans; if (axis != -1 && axis != rank - 1) { X_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - Out_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - TransCompute(rank, dev_ctx, *X, &X_trans, perm); - TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); + Out_trans.mutable_data(framework::make_ddim(shape), + context.GetPlace()); + TransCompute(rank, dev_ctx, *X, &X_trans, + perm); + TransCompute(rank, dev_ctx, *Out, + &Out_trans, perm); X_2d = framework::ReshapeToMatrix(X_trans, rank - 1); Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); } else { @@ -52,11 +56,12 @@ class SoftmaxCUDNNKernel : public framework::OpKernel { } math::SoftmaxCUDNNFunctor()( - context.template device_context(), - &X_2d, &Out_2d); + context.template device_context(), &X_2d, + &Out_2d); if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, Out_trans, Out, perm); + TransCompute(rank, dev_ctx, Out_trans, + Out, perm); } } }; @@ -65,7 +70,8 @@ template class SoftmaxGradCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& dev_ctx = + context.template device_context(); auto* Out = context.Input("Out"); auto* dOut = context.Input(framework::GradVarName("Out")); auto* dX = context.Output(framework::GradVarName("X")); @@ -82,11 +88,16 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel { Tensor dX_trans, Out_trans, dOut_trans; if (axis != -1 && axis != rank - 1) { dX_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - Out_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - dOut_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - TransCompute(rank, dev_ctx, *dX, &dX_trans, perm); - TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); - TransCompute(rank, dev_ctx, *dOut, &dOut_trans, perm); + Out_trans.mutable_data(framework::make_ddim(shape), + context.GetPlace()); + dOut_trans.mutable_data(framework::make_ddim(shape), + context.GetPlace()); + TransCompute(rank, dev_ctx, *dX, + &dX_trans, perm); + TransCompute(rank, dev_ctx, *Out, + &Out_trans, perm); + TransCompute(rank, dev_ctx, *dOut, + &dOut_trans, perm); dX_2d = framework::ReshapeToMatrix(dX_trans, rank - 1); Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); dOut_2d = framework::ReshapeToMatrix(dOut_trans, rank - 1); @@ -97,11 +108,12 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel { } math::SoftmaxGradCUDNNFunctor()( - context.template device_context(), - &Out_2d, &dOut_2d, &dX_2d); + context.template device_context(), &Out_2d, + &dOut_2d, &dX_2d); if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, dX_trans, dX, perm); + TransCompute(rank, dev_ctx, dX_trans, dX, + perm); } } 
}; diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 02f256fa64..f04c5db9e1 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/softmax_op.h" +#include #include #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 1810b23e0d..10b3f63339 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -24,7 +24,8 @@ namespace operators { using Tensor = framework::Tensor; static inline void CalcTransPermAndShapeByAxis(const Tensor& x, const int axis, - std::vector* perm, std::vector* shape) { + std::vector* perm, + std::vector* shape) { auto dim_x = x.dims(); int rank = dim_x.size(); @@ -65,7 +66,8 @@ class SoftmaxKernel : public framework::OpKernel { Tensor X_trans, Out_trans; if (axis != -1 && axis != rank - 1) { X_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - Out_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + Out_trans.mutable_data(framework::make_ddim(shape), + context.GetPlace()); TransCompute(rank, dev_ctx, *X, &X_trans, perm); TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); X_2d = framework::ReshapeToMatrix(X_trans, rank - 1); @@ -75,7 +77,6 @@ class SoftmaxKernel : public framework::OpKernel { Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); } - #ifdef PADDLE_ON_INFERENCE math::SoftmaxFunctor()( context.template device_context(), &X_2d, &Out_2d); @@ -111,8 +112,10 @@ class SoftmaxGradKernel : public framework::OpKernel { Tensor dX_trans, Out_trans, dOut_trans; if (axis != -1 && axis != rank - 1) { dX_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - Out_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - dOut_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); + Out_trans.mutable_data(framework::make_ddim(shape), + context.GetPlace()); + dOut_trans.mutable_data(framework::make_ddim(shape), + context.GetPlace()); TransCompute(rank, dev_ctx, *dX, &dX_trans, perm); TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); TransCompute(rank, dev_ctx, *dOut, &dOut_trans, perm); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 273d74ca6e..276344df58 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1872,10 +1872,8 @@ def softmax(input, use_cudnn=False, name=None, axis=-1): type="softmax", inputs={"X": input}, outputs={"Out": softmax_out}, - attrs={ - "axis": axis, - "use_cudnn": use_cudnn - }) + attrs={"axis": axis, + "use_cudnn": use_cudnn}) return softmax_out From 8b88960dcec6076a205c07ebbbd69e5f90e78bdb Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sat, 9 Mar 2019 17:24:45 +0800 Subject: [PATCH 07/18] fix doc. 
test=develop --- paddle/fluid/operators/softmax_op.cc | 8 ++++---- python/paddle/fluid/layers/nn.py | 10 ++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index f04c5db9e1..3592f20dbf 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -86,7 +86,7 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The input tensor of softmax, " - "whose :attr:`axis` dimension is the input_feature_dimensions."); + "whose dimension :attr:`axis` is the input_feature_dimensions."); AddOutput("Out", "The normalized values with the same shape as X."); AddAttr("axis", "The dimension index of Input(x) to perform softmax," @@ -116,13 +116,13 @@ Softmax Operator. The input of the softmax operator is a tensor of any rank. The output tensor has the same shape as the input. -The :attr:`axis` th dimension of the input tensor will be permuted to the last. +The dimension :attr:`axis` of the input tensor will be permuted to the last. Then the input tensor will be logically flattened to a 2-D matrix. The matrix's -second dimension(row length) is as same as the :attr:`axis` dimension of the input +second dimension(row length) is as same as the dimension :attr:`axis` of the input tensor, and the first dimension(column length) is the product of all other dimensions of the input tensor. For each row of the matrix, the softmax operator squashes the K-dimensional(K is the width of the matrix, which is also the size -of the input tensor's :attr:`axis` dimension) vector of arbitrary real values to a +of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a K-dimensional vector of real values in the range [0, 1] that add up to 1. It computes the exponential of the given dimension and the sum of exponential values of all the other dimensions in the K-dimensional vector input. diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 276344df58..19c9734a9e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1824,13 +1824,13 @@ def softmax(input, use_cudnn=False, name=None, axis=-1): The input of the softmax operator is a tensor of any rank. The output tensor has the same shape as the input. - The :attr:`axis` th dimension of the input tensor will be permuted to the last. + The dimension :attr:`axis` of the input tensor will be permuted to the last. Then the input tensor will be logically flattened to a 2-D matrix. The matrix's - second dimension(row length) is as same as the :attr:`axis` th dimension of the input + second dimension(row length) is as same as the dimension :attr:`axis` of the input tensor, and the first dimension(column length) is the product of all other dimensions of the input tensor. For each row of the matrix, the softmax operator squashes the K-dimensional(K is the width of the matrix, which is also the size - of the input tensor's :attr:`axis` th dimension) vector of arbitrary real values to a + of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a K-dimensional vector of real values in the range [0, 1] that add up to 1. It computes the exponential of the given dimension and the sum of exponential @@ -1852,7 +1852,9 @@ def softmax(input, use_cudnn=False, name=None, axis=-1): False by default. Default: False name (str|None): A name for this layer(optional). 
If set None, the layer will be named automatically. Default: None. - axis (int): The index of dimension to perform softmax calculation. Default: -1. + axis (int): The index of dimension to perform softmax calculations, it should + be in range :math:`[-1, rank - 1]`, while :math:`rank` is the rank of + input variable. Default: -1. Returns: Variable: output of softmax From 412b7cbdf168b872b4c07040d5193eb164708941 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 10 Mar 2019 12:08:07 +0800 Subject: [PATCH 08/18] fix format. test=develop --- paddle/fluid/operators/softmax_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 3592f20dbf..578ab8eee3 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" From 6c641827092fb10f6eeb56477819c76f2b331969 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 18 Mar 2019 11:57:16 +0000 Subject: [PATCH 09/18] refine softmax kernel. test=develop --- paddle/fluid/operators/math/softmax.h | 9 +- paddle/fluid/operators/math/softmax_impl.h | 22 +-- .../operators/mkldnn/softmax_mkldnn_op.cc | 134 +++++------------- paddle/fluid/operators/softmax_cudnn_op.cu.cc | 85 +++-------- paddle/fluid/operators/softmax_op.h | 114 ++++++--------- .../operators/softmax_with_cross_entropy_op.h | 2 +- paddle/fluid/operators/warpctc_cudnn_op.cu.cc | 2 +- 7 files changed, 119 insertions(+), 249 deletions(-) diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h index 81beef56d9..f8e250fa2e 100644 --- a/paddle/fluid/operators/math/softmax.h +++ b/paddle/fluid/operators/math/softmax.h @@ -23,15 +23,16 @@ template class SoftmaxFunctor { public: - void operator()(const DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y); + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* X, framework::Tensor* Y); }; template class SoftmaxGradFunctor { public: - void operator()(const DeviceContext& context, const framework::Tensor* y, - const framework::Tensor* y_grad, framework::Tensor* x_grad); + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* y, const framework::Tensor* y_grad, + framework::Tensor* x_grad); }; #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index d77b6712c5..9bcb272b93 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -36,8 +36,8 @@ struct ValueClip { template void SoftmaxFunctor::operator()( - const DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y) { + const DeviceContext& context, const int axis_dim, + const framework::Tensor* X, framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); auto softmax = EigenMatrix::From(*Y); @@ -46,10 +46,13 @@ void SoftmaxFunctor::operator()( const int batch_size = logits.dimension(kBatchDim); const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; Eigen::DSizes along_class(kClassDim); Eigen::DSizes batch_by_one(batch_size, 1); Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); auto shifted_logits = (logits - 
logits.maximum(along_class) @@ -60,11 +63,11 @@ void SoftmaxFunctor::operator()( softmax.device(*context.eigen_device()) = shifted_logits.exp(); softmax.device(*context.eigen_device()) = (softmax * - softmax.sum(along_class) + softmax.reshape(batch_axis_remain) + .sum(along_class) .inverse() .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + .broadcast(one_axis)); } template @@ -90,7 +93,7 @@ class SoftmaxFunctor> { template void SoftmaxGradFunctor::operator()( - const DeviceContext& context, const framework::Tensor* y, + const DeviceContext& context, const int axis_dim, const framework::Tensor* y, const framework::Tensor* y_grad, framework::Tensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); @@ -101,16 +104,19 @@ void SoftmaxGradFunctor::operator()( const int batch_size = softmax.dimension(kBatchDim); const int num_classes = softmax.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; Eigen::DSizes along_class(kClassDim); Eigen::DSizes batch_by_one(batch_size, 1); Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); auto dot = (softmax * softmax_grad) + .reshape(batch_axis_remain) .sum(along_class) .eval() - .reshape(batch_by_one) - .broadcast(one_by_class); + .broadcast(one_axis); logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; } diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index c73dfd65e7..0ce5522194 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -110,46 +110,28 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { "It must use CPUPlace."); auto& dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); - const Tensor* X = ctx.Input("X"); - Tensor* Out = ctx.Output("Out"); + const Tensor* input = ctx.Input("X"); + Tensor* output = ctx.Output("Out"); PADDLE_ENFORCE_EQ( - X->dims(), Out->dims(), + input->dims(), output->dims(), "The shape of softmax's input and output must be identical."); - const int axis = ctx.Attr("axis"); - int rank = X->dims().size(); - // make sure 'output' holds memory, which will be shared by // 'flattened_output' later. 
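The reshaped Eigen expression above normalizes over the softmax axis by viewing each (batch, classes) row as (axis_dim, remain) in row-major order, so the elements of one softmax group sit `remain` apart in memory. A minimal plain-C++ sketch of that normalization step (function and variable names are illustrative, not Paddle's); this is also the access pattern the Stride* kernels introduced later rely on:

#include <vector>

// Sketch only: divides every softmax group by its sum, where the data is
// laid out row-major as [batch][axis_dim][remain] and one group consists of
// the axis_dim elements spaced `remain` apart for a fixed (batch, remain).
void NormalizeOverAxis(std::vector<float>* y, int batch, int axis_dim,
                       int remain) {
  const int d = axis_dim * remain;  // classes per batch row
  for (int b = 0; b < batch; ++b) {
    float* row = y->data() + b * d;
    for (int r = 0; r < remain; ++r) {
      float sum = 0.f;
      for (int a = 0; a < axis_dim; ++a) sum += row[a * remain + r];
      const float inv = 1.f / sum;
      for (int a = 0; a < axis_dim; ++a) row[a * remain + r] *= inv;
    }
  }
}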
- Out->mutable_data(ctx.GetPlace()); - - std::vector perm, shape; - CalcTransPermAndShapeByAxis(*X, axis, &perm, &shape); - - Tensor X_2d, Out_2d; - Tensor X_trans, Out_trans; - if (axis != -1 && axis != rank - 1) { - X_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - Out_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - TransCompute(rank, dev_ctx, *X, &X_trans, - perm); - TransCompute(rank, dev_ctx, *Out, - &Out_trans, perm); - auto dims = X_trans.dims(); - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - X_2d.ShareDataWith(X_trans).Resize(flattened_dims); - Out_2d.ShareDataWith(Out_trans).Resize(flattened_dims); - } else { - auto dims = X->dims(); - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - X_2d.ShareDataWith(*X).Resize(flattened_dims); - Out_2d.ShareDataWith(*Out).Resize(flattened_dims); - } + output->mutable_data(ctx.GetPlace()); + + // flatten input and output to 2-D matrixs + auto dims = input->dims(); // input and output share the same shape + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::Tensor flattened_input; + framework::Tensor flattened_output; + flattened_input.ShareDataWith(*input).Resize(flattened_dims); + flattened_output.ShareDataWith(*output).Resize(flattened_dims); - const T* input_data = X_2d.data(); - T* output_data = Out_2d.mutable_data(ctx.GetPlace()); + const T* input_data = flattened_input.data(); + T* output_data = flattened_output.mutable_data(ctx.GetPlace()); - std::vector src_tz = paddle::framework::vectorize2int(X_2d.dims()); + std::vector src_tz = paddle::framework::vectorize2int(flattened_dims); std::vector dst_tz = src_tz; // Same memory descriptor to be used for input and output memory::dims softmax_tz = {src_tz[0], src_tz[1]}; @@ -179,16 +161,10 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { // We cannot use softmax_dst_memory_p to get prim desc as // it contains flattened dims (2D) while output tensor can // have 2,3,4+ dims - if (axis != -1 && axis != rank - 1) { - auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( - shape, mkldnn::memory::format::blocked); - Out_trans.set_mkldnn_prim_desc(output_mem_pd); - } else { - auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(Out->dims()), - mkldnn::memory::format::blocked); - Out->set_mkldnn_prim_desc(output_mem_pd); - } + auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( + paddle::framework::vectorize2int(output->dims()), + mkldnn::memory::format::blocked); + output->set_mkldnn_prim_desc(output_mem_pd); std::vector pipeline{ *(static_cast(softmax_p.get()))}; @@ -202,11 +178,6 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { output_data[i] < threshold ? 
threshold : output_data[i]; } } - - if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, Out_trans, Out, - perm); - } } }; @@ -219,55 +190,33 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); - const Tensor* Out = ctx.Input("Out"); - auto* dOut = ctx.template Input(framework::GradVarName("Out")); - auto* dX = + const Tensor* output = ctx.Input("Out"); + auto* dout = ctx.template Input(framework::GradVarName("Out")); + auto* dx = ctx.template Output(framework::GradVarName("X")); PADDLE_ENFORCE_EQ( - dOut->dims(), dX->dims(), + dout->dims(), dx->dims(), "The shape of softmax_grad's input and output must be identical."); - const int axis = ctx.Attr("axis"); - int rank = Out->dims().size(); - // make sure 'dx' holds memory, which will be shared by 'flattened_dx' // later. - dX->template mutable_data(ctx.GetPlace()); - - std::vector perm, shape; - CalcTransPermAndShapeByAxis(*dX, axis, &perm, &shape); - - Tensor dX_2d, Out_2d, dOut_2d; - Tensor dX_trans, Out_trans, dOut_trans; - if (axis != -1 && axis != rank - 1) { - dX_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - Out_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - dOut_trans.mutable_data(framework::make_ddim(shape), ctx.GetPlace()); - TransCompute(rank, dev_ctx, *dX, &dX_trans, - perm); - TransCompute(rank, dev_ctx, *Out, - &Out_trans, perm); - TransCompute(rank, dev_ctx, *dOut, - &dOut_trans, perm); - auto dims = dX_trans.dims(); - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - dX_2d.ShareDataWith(dX_trans).Resize(flattened_dims); - Out_2d.ShareDataWith(Out_trans).Resize(flattened_dims); - dOut_2d.ShareDataWith(dOut_trans).Resize(flattened_dims); - } else { - auto dims = dX->dims(); - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - dX_2d.ShareDataWith(*dX).Resize(flattened_dims); - Out_2d.ShareDataWith(*Out).Resize(flattened_dims); - dOut_2d.ShareDataWith(*dOut).Resize(flattened_dims); - } - - const T* dst_data = Out_2d.data(); - const T* diff_dst_ptr = dOut_2d.template data(); - T* diff_src_ptr = dX_2d.template mutable_data(ctx.GetPlace()); - - std::vector dst_tz = paddle::framework::vectorize2int(Out_2d.dims()); + dx->template mutable_data(ctx.GetPlace()); + + auto dims = dout->dims(); // input and output share the same shape + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::Tensor flattened_output; + framework::Tensor flattened_dout; + framework::Tensor flattened_dx; + flattened_output.ShareDataWith(*output).Resize(flattened_dims); + flattened_dout.ShareDataWith(*dout).Resize(flattened_dims); + flattened_dx.ShareDataWith(*dx).Resize(flattened_dims); + + const T* dst_data = flattened_output.data(); + const T* diff_dst_ptr = flattened_dout.template data(); + T* diff_src_ptr = flattened_dx.template mutable_data(ctx.GetPlace()); + + std::vector dst_tz = paddle::framework::vectorize2int(flattened_dims); std::vector src_tz(dst_tz); // Same memory descriptor to be used for input and output @@ -312,11 +261,6 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { std::vector pipeline{*softmax_bwd_p}; stream(stream::kind::eager).submit(pipeline).wait(); - - if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, dX_trans, dX, - perm); - } } }; } // namespace operators diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc 
b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index 9e24c76793..ad3e5543f1 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/softmax_op.h" namespace paddle { namespace operators { @@ -25,44 +24,22 @@ template class SoftmaxCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); auto* X = context.Input("X"); auto* Out = context.Output("Out"); - const int axis = context.Attr("axis"); - int rank = X->dims().size(); // allocate memory on device. Out->mutable_data(context.GetPlace()); - std::vector perm, shape; - CalcTransPermAndShapeByAxis(*X, axis, &perm, &shape); - - Tensor X_2d, Out_2d; - Tensor X_trans, Out_trans; - if (axis != -1 && axis != rank - 1) { - X_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - Out_trans.mutable_data(framework::make_ddim(shape), - context.GetPlace()); - TransCompute(rank, dev_ctx, *X, &X_trans, - perm); - TransCompute(rank, dev_ctx, *Out, - &Out_trans, perm); - X_2d = framework::ReshapeToMatrix(X_trans, rank - 1); - Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); - } else { - X_2d = framework::ReshapeToMatrix(*X, rank - 1); - Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); - } + auto dims = X->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_x; + framework::LoDTensor flattened_out; + flattened_x.ShareDataWith(*X).Resize(flattened_dims); + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); math::SoftmaxCUDNNFunctor()( - context.template device_context(), &X_2d, - &Out_2d); - - if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, Out_trans, - Out, perm); - } + context.template device_context(), + &flattened_x, &flattened_out); } }; @@ -70,51 +47,25 @@ template class SoftmaxGradCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); auto* Out = context.Input("Out"); auto* dOut = context.Input(framework::GradVarName("Out")); auto* dX = context.Output(framework::GradVarName("X")); - const int axis = context.Attr("axis"); - int rank = Out->dims().size(); // allocate memory on device. 
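The cuDNN kernels above go back to flattening every leading dimension into rows and keeping the last dimension as columns, since (as enforced later in this series) the cuDNN path is only used for softmax over the last axis. A sketch of the 2-D shape that flatten_to_2d(dims, rank - 1) is expected to produce, assuming the usual leading/trailing product split (the helper name below is mine, not Paddle's):

#include <cstdint>
#include <utility>
#include <vector>

// Illustrative only: rows = product of dims before `num_col_dims`,
// cols = product of dims from `num_col_dims` onward.
std::pair<int64_t, int64_t> FlattenTo2D(const std::vector<int64_t>& dims,
                                        int num_col_dims) {
  int64_t rows = 1, cols = 1;
  for (int i = 0; i < num_col_dims; ++i) rows *= dims[i];
  for (int i = num_col_dims; i < static_cast<int>(dims.size()); ++i)
    cols *= dims[i];
  return {rows, cols};  // e.g. {2, 3, 4, 5} with num_col_dims = 3 -> {24, 5}
}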
dX->mutable_data(context.GetPlace()); - std::vector perm, shape; - CalcTransPermAndShapeByAxis(*dX, axis, &perm, &shape); - - Tensor dX_2d, Out_2d, dOut_2d; - Tensor dX_trans, Out_trans, dOut_trans; - if (axis != -1 && axis != rank - 1) { - dX_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - Out_trans.mutable_data(framework::make_ddim(shape), - context.GetPlace()); - dOut_trans.mutable_data(framework::make_ddim(shape), - context.GetPlace()); - TransCompute(rank, dev_ctx, *dX, - &dX_trans, perm); - TransCompute(rank, dev_ctx, *Out, - &Out_trans, perm); - TransCompute(rank, dev_ctx, *dOut, - &dOut_trans, perm); - dX_2d = framework::ReshapeToMatrix(dX_trans, rank - 1); - Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); - dOut_2d = framework::ReshapeToMatrix(dOut_trans, rank - 1); - } else { - dX_2d = framework::ReshapeToMatrix(*dX, rank - 1); - Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); - dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1); - } + auto dims = Out->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_out; + framework::LoDTensor flattened_d_out; + framework::LoDTensor flattened_d_x; + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); + flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); math::SoftmaxGradCUDNNFunctor()( - context.template device_context(), &Out_2d, - &dOut_2d, &dX_2d); - - if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, dX_trans, dX, - perm); - } + context.template device_context(), + &flattened_out, &flattened_d_out, &flattened_d_x); } }; diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 10b3f63339..76e8eeab08 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -13,81 +13,66 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/transpose_op.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using DDim = framework::DDim; -static inline void CalcTransPermAndShapeByAxis(const Tensor& x, const int axis, - std::vector* perm, - std::vector* shape) { - auto dim_x = x.dims(); - int rank = dim_x.size(); +static inline int CanonicalAxis(const int axis, const int rank) { + if (axis < 0) { + return axis + rank; + } + return axis; +} - if (axis == -1 || axis == rank - 1) { - return; +static inline int SizeToAxis(const int axis, DDim dims) { + int size = 1; + for (int i = 0; i < axis; i++) { + size *= dims[i]; } + return size; +} - for (int i = 0; i < rank - 1; i++) { - if (i == axis) { - perm->push_back(rank - 1); - shape->push_back(dim_x[rank - 1]); - } else { - perm->push_back(i); - shape->push_back(dim_x[i]); - } +static inline int SizeFromAxis(const int axis, DDim dims) { + int size = 1; + for (int i = axis; i < dims.size(); i++) { + size *= dims[i]; } - perm->push_back(axis); - shape->push_back(dim_x[axis]); + return size; } template class SoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); auto* X = context.Input("X"); auto* Out = context.Output("Out"); - const int axis = context.Attr("axis"); - int rank = X->dims().size(); + const int rank = X->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + int axis_dim = X->dims()[axis]; // allocate memory on device. Out->mutable_data(context.GetPlace()); - std::vector perm, shape; - CalcTransPermAndShapeByAxis(*X, axis, &perm, &shape); - + const int n = SizeToAxis(axis, X->dims()); + const int d = SizeFromAxis(axis, X->dims()); Tensor X_2d, Out_2d; - Tensor X_trans, Out_trans; - if (axis != -1 && axis != rank - 1) { - X_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - Out_trans.mutable_data(framework::make_ddim(shape), - context.GetPlace()); - TransCompute(rank, dev_ctx, *X, &X_trans, perm); - TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); - X_2d = framework::ReshapeToMatrix(X_trans, rank - 1); - Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); - } else { - X_2d = framework::ReshapeToMatrix(*X, rank - 1); - Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); - } + X_2d.ShareDataWith(*X).Resize({n, d}); + Out_2d.ShareDataWith(*Out).Resize({n, d}); + // Tensor X_2d = framework::ReshapeToMatrix(*X, axis - 1); + // Tensor Out_2d = framework::ReshapeToMatrix(*Out, axis - 1); #ifdef PADDLE_ON_INFERENCE math::SoftmaxFunctor()( - context.template device_context(), &X_2d, &Out_2d); + context.template device_context(), axis_dim, &X_2d, &Out_2d); #else math::SoftmaxFunctor()( - context.template device_context(), &X_2d, &Out_2d); + context.template device_context(), axis_dim, &X_2d, &Out_2d); #endif - - if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, Out_trans, Out, perm); - } } }; @@ -95,46 +80,29 @@ template class SoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); auto* Out = context.Input("Out"); auto* dOut = context.Input(framework::GradVarName("Out")); auto* dX = context.Output(framework::GradVarName("X")); - const int axis = context.Attr("axis"); - int rank = 
Out->dims().size(); + const int rank = dX->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + int axis_dim = dX->dims()[axis]; // allocate memory on device. dX->mutable_data(context.GetPlace()); - std::vector perm, shape; - CalcTransPermAndShapeByAxis(*dX, axis, &perm, &shape); - + const int n = SizeToAxis(axis, dX->dims()); + const int d = SizeFromAxis(axis, dX->dims()); Tensor dX_2d, Out_2d, dOut_2d; - Tensor dX_trans, Out_trans, dOut_trans; - if (axis != -1 && axis != rank - 1) { - dX_trans.mutable_data(framework::make_ddim(shape), context.GetPlace()); - Out_trans.mutable_data(framework::make_ddim(shape), - context.GetPlace()); - dOut_trans.mutable_data(framework::make_ddim(shape), - context.GetPlace()); - TransCompute(rank, dev_ctx, *dX, &dX_trans, perm); - TransCompute(rank, dev_ctx, *Out, &Out_trans, perm); - TransCompute(rank, dev_ctx, *dOut, &dOut_trans, perm); - dX_2d = framework::ReshapeToMatrix(dX_trans, rank - 1); - Out_2d = framework::ReshapeToMatrix(Out_trans, rank - 1); - dOut_2d = framework::ReshapeToMatrix(dOut_trans, rank - 1); - } else { - dX_2d = framework::ReshapeToMatrix(*dX, rank - 1); - Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); - dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1); - } + dX_2d.ShareDataWith(*dX).Resize({n, d}); + Out_2d.ShareDataWith(*Out).Resize({n, d}); + dOut_2d.ShareDataWith(*dOut).Resize({n, d}); + // Tensor Out_2d = framework::ReshapeToMatrix(*Out, axis - 1); + // Tensor dOut_2d = framework::ReshapeToMatrix(*dOut, axis - 1); + // Tensor dX_2d = framework::ReshapeToMatrix(*dX, axis - 1); math::SoftmaxGradFunctor()( - context.template device_context(), &Out_2d, &dOut_2d, + context.template device_context(), axis_dim, &Out_2d, &dOut_2d, &dX_2d); - - if (axis != -1 && axis != rank - 1) { - TransCompute(rank, dev_ctx, dX_trans, dX, perm); - } } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index c0530e3d8b..ff99e4207a 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -43,7 +43,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); math::SoftmaxFunctor()( - dev_ctx, logits, softmax); + dev_ctx, -1, logits, softmax); math::CrossEntropyFunctor()( dev_ctx, loss, softmax, labels, context.Attr("soft_label"), context.Attr("ignore_index")); diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc index a764d59410..716faf2995 100644 --- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc +++ b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc @@ -69,7 +69,7 @@ class CudnnCTCKernel : public framework::OpKernel { int rank = logits->dims().size(); Tensor in_2d = framework::ReshapeToMatrix(*logits, rank - 1); Tensor out_2d = framework::ReshapeToMatrix(softmax_logits, rank - 1); - math::SoftmaxFunctor()(dev_ctx, &in_2d, &out_2d); + math::SoftmaxFunctor()(dev_ctx, -1, &in_2d, &out_2d); // ctc needs sequences data stored in transposed padding format // logits and grad using padding data of layout 'TNC' From 93701dba50e2555c7bd9cb69efe38debd5441cb7 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 20 Mar 2019 03:27:35 +0000 Subject: [PATCH 10/18] add jit kernel for softmax axis. 
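The helpers introduced above (CanonicalAxis, SizeToAxis, SizeFromAxis) collapse an N-D input into the (n, d) matrix that both the forward and backward kernels operate on. Restated as a small standalone program with an illustrative shape (the numbers are examples, not taken from the patch):

#include <cassert>
#include <vector>

// Mirrors the helpers in softmax_op.h: for a shape `dims` and a possibly
// negative `axis`, n is the product of dims before the axis and d the
// product of dims from the axis onward; axis_dim = dims[axis].
int CanonicalAxis(int axis, int rank) { return axis < 0 ? axis + rank : axis; }

int SizeToAxis(int axis, const std::vector<int>& dims) {
  int size = 1;
  for (int i = 0; i < axis; ++i) size *= dims[i];
  return size;
}

int SizeFromAxis(int axis, const std::vector<int>& dims) {
  int size = 1;
  for (int i = axis; i < static_cast<int>(dims.size()); ++i) size *= dims[i];
  return size;
}

int main() {
  std::vector<int> dims = {2, 3, 4, 5};
  int axis = CanonicalAxis(/*axis=*/-2, /*rank=*/4);  // -> 2
  assert(SizeToAxis(axis, dims) == 6);     // n = 2 * 3
  assert(SizeFromAxis(axis, dims) == 20);  // d = 4 * 5; axis_dim = 4, remain = 5
  return 0;
}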
test=develop --- paddle/fluid/operators/jit/benchmark.cc | 2 +- paddle/fluid/operators/jit/helper.cc | 2 + paddle/fluid/operators/jit/kernel_base.h | 24 ++++++- paddle/fluid/operators/jit/more/mix/mix.cc | 18 +++-- paddle/fluid/operators/jit/more/mix/mix.h | 2 +- .../operators/jit/more/mkl/CMakeLists.txt | 1 + paddle/fluid/operators/jit/more/mkl/mkl.cc | 35 ++++++++++ paddle/fluid/operators/jit/more/mkl/mkl.h | 23 +++++-- .../fluid/operators/jit/refer/CMakeLists.txt | 2 + paddle/fluid/operators/jit/refer/refer.cc | 2 + paddle/fluid/operators/jit/refer/refer.h | 36 ++++++++-- paddle/fluid/operators/jit/test.cc | 67 ++++++++++--------- paddle/fluid/operators/math/softmax_impl.h | 7 +- paddle/fluid/operators/softmax_op.cc | 15 ++++- paddle/fluid/operators/softmax_op.h | 5 -- .../fluid/tests/unittests/test_softmax_op.py | 22 +----- 16 files changed, 185 insertions(+), 78 deletions(-) diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index fbb04a166e..9ff1fe478d 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -386,7 +386,7 @@ void BenchKernelSoftmax() { RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); const T* x_data = x.data(); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls(n, x_data, y_data, n, bs); + BenchAllImpls(n, x_data, y_data, n, bs, 1); } } } diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index eb1c410b6f..fe508788ef 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -34,6 +34,7 @@ const char* to_string(KernelType kt) { ONE_CASE(kVAddRelu); ONE_CASE(kVSub); ONE_CASE(kVScal); + ONE_CASE(kStrideScal); ONE_CASE(kVAddBias); ONE_CASE(kVRelu); ONE_CASE(kVBroadcast); @@ -55,6 +56,7 @@ const char* to_string(KernelType kt) { ONE_CASE(kMatMul); ONE_CASE(kHMax); ONE_CASE(kHSum); + ONE_CASE(kStrideSum); ONE_CASE(kSoftmax); ONE_CASE(kEmbSeqPool); ONE_CASE(kSgd); diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index bd34d7dfc7..6fd8a59d55 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -53,6 +53,8 @@ typedef enum { kVSquare, kVSub, kVTanh, + kStrideSum, + kStrideScal, } KernelType; typedef enum { @@ -74,6 +76,14 @@ struct XYZNTuple { template struct AXYNTuple : public XYZNTuple {}; +// a, x, y, n, stride +template +struct AXYNSTuple { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, const T*, T*, int, int); +}; + // x, y, n template struct XYNTuple { @@ -86,6 +96,14 @@ struct XYNTuple { template struct XRNTuple : public XYNTuple {}; +// x, returned value, n, stride +template +struct XRNSTuple { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, T*, int, int); +}; + #define DECLARE_KERNELTUPLE(kernel_tuple, type) \ template \ struct type##Tuple : public kernel_tuple { \ @@ -101,6 +119,8 @@ DECLARE_KERNELTUPLE(XYZNTuple, VSub); DECLARE_KERNELTUPLE(AXYNTuple, VScal); DECLARE_KERNELTUPLE(AXYNTuple, VAddBias); +DECLARE_KERNELTUPLE(AXYNSTuple, StrideScal); + DECLARE_KERNELTUPLE(XYNTuple, VRelu); DECLARE_KERNELTUPLE(XYNTuple, VIdentity); DECLARE_KERNELTUPLE(XYNTuple, VSquare); @@ -112,6 +132,8 @@ DECLARE_KERNELTUPLE(XYNTuple, VCopy); DECLARE_KERNELTUPLE(XRNTuple, HMax); DECLARE_KERNELTUPLE(XRNTuple, HSum); +DECLARE_KERNELTUPLE(XRNSTuple, StrideSum); + typedef struct { void* gates; // gates: x_ch, x_ih, x_fh, x_oh const void* 
ct_1; @@ -285,7 +307,7 @@ struct SoftmaxTuple { static constexpr KernelType kernel_type = kSoftmax; typedef T data_type; typedef int attr_type; - typedef void (*func_type)(const T*, T*, int, int); + typedef void (*func_type)(const T*, T*, int, int, int); }; // nChw16c = nChw16c .* NC diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 6e709a16d2..58a44d4b55 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -50,10 +50,12 @@ void VTanh(const T* x, T* y, int n) { compute_addbias(&b, y, y, n); } -void Softmax(const T* x, T* y, int n, int bs) { +void Softmax(const T* x, T* y, int n, int bs, int m) { auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_stridesum = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_stridescal = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vaddbias = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vexp = KernelFuncs, CPUPlace>::Cache().At(n); @@ -64,9 +66,17 @@ void Softmax(const T* x, T* y, int n, int bs) { scalar = static_cast(0) - scalar; compute_vaddbias(&scalar, x, y, n); // x - max compute_vexp(y, y, n); - compute_hsum(y, &scalar, n); - scalar = static_cast(1) / scalar; - compute_vscal(&scalar, y, y, n); + if (m == 1) { + compute_hsum(y, &scalar, n); + scalar = static_cast(1) / scalar; + compute_vscal(&scalar, y, y, n); + } else { + for (int j = 0; j < m; ++j) { + compute_stridesum(&y[j], &scalar, n, m); + scalar = static_cast(1) / scalar; + compute_stridescal(&scalar, &y[j], &y[j], n, m); + } + } x += n; y += n; } diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index 994d485909..a0079506f8 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -26,7 +26,7 @@ using T = float; void VSigmoid(const T* x, T* y, int n); void VTanh(const T* x, T* y, int n); -void Softmax(const T* x, T* y, int n, int bs); +void Softmax(const T* x, T* y, int n, int bs, int m); void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr); void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr); diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index f69417c370..56f1a62ad4 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -7,6 +7,7 @@ USE_JITKERNEL_MORE(kMatMul, mkl) USE_JITKERNEL_MORE(kVMul, mkl) USE_JITKERNEL_MORE(kVAdd, mkl) USE_JITKERNEL_MORE(kVScal, mkl) +USE_JITKERNEL_MORE(kStrideScal, mkl) USE_JITKERNEL_MORE(kVExp, mkl) USE_JITKERNEL_MORE(kVSquare, mkl) USE_JITKERNEL_MORE(kVCopy, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 4f600b3814..2828d75815 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -78,6 +78,24 @@ void VScal(const double* a, const double* x, double* y, int n) { } } +template <> +void StrideScal(const float* a, const float* x, float* y, int n, int stride) { + if (x == y) { + platform::dynload::cblas_sscal(n, *a, y, stride); + } else { + refer::StrideScal(a, x, y, n, stride); + } +} + +template <> +void StrideScal(const double* a, const double* x, double* y, int n, int stride) { + if (x == y) { + platform::dynload::cblas_dscal(n, *a, y, stride); + } else 
{ + refer::StrideScal(a, x, y, n, stride); + } +} + template <> void VExp(const float* x, float* y, int n) { platform::dynload::vsExp(n, x, y); @@ -128,6 +146,16 @@ void ASum(const double* x, double* res, int n) { res[0] = platform::dynload::cblas_dasum(n, x, 1); } +template <> +void StrideSum(const float* x, float* res, int n, int stride) { + res[0] = platform::dynload::cblas_sasum(n, x, stride); +} + +template <> +void StrideSum(const double* x, double* res, int n, int stride) { + res[0] = platform::dynload::cblas_dasum(n, x, stride); +} + // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> bool VMulKernel::CanBeUsed(const int& d) const { @@ -144,6 +172,11 @@ bool VScalKernel::CanBeUsed(const int& d) const { return platform::MayIUse(platform::avx512f) && d > 512; } +template <> +bool StrideScalKernel::CanBeUsed(const int& d) const { + return platform::MayIUse(platform::avx512f) && d > 512; +} + template <> bool VExpKernel::CanBeUsed(const int& d) const { return d > 7; @@ -235,6 +268,7 @@ bool SoftmaxKernel::CanBeUsed(const int& d) const { AWALYS_USE_ME_WITH_DOUBLE(VMul); AWALYS_USE_ME_WITH_DOUBLE(VAdd); AWALYS_USE_ME_WITH_DOUBLE(VScal); +AWALYS_USE_ME_WITH_DOUBLE(StrideScal); AWALYS_USE_ME_WITH_DOUBLE(VExp); AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); AWALYS_USE_ME_WITH_DOUBLE(VTanh); @@ -259,6 +293,7 @@ REGISTER_MKL_KERNEL(MatMul); REGISTER_MKL_KERNEL(VMul); REGISTER_MKL_KERNEL(VAdd); REGISTER_MKL_KERNEL(VScal); +REGISTER_MKL_KERNEL(StrideScal); REGISTER_MKL_KERNEL(VExp); REGISTER_MKL_KERNEL(VSquare); REGISTER_MKL_KERNEL(VCopy); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index f51dca654c..1e974c095f 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -129,7 +129,13 @@ template void ASum(const T* x, T* res, int n); template -void Softmax(const T* x, T* y, int n, int bs) { +void StrideSum(const T* x, T* res, int n, int stride); + +template +void StrideScal(const T* a, const T* x, T* y, int n, int stride); + +template +void Softmax(const T* x, T* y, int n, int bs, int m=1) { std::vector entities(bs); for (int i = 0; i < bs; ++i) { entities[i] = x[i * n]; @@ -143,9 +149,17 @@ void Softmax(const T* x, T* y, int n, int bs) { VExp(y, y, n * bs); for (int i = 0; i < bs; ++i) { T sum; - ASum(&y[i * n], &sum, n); - sum = static_cast(1) / sum; - VScal(&sum, &y[i * n], &y[i * n], n); + if (m == 1) { + ASum(&y[i * n], &sum, n); + sum = static_cast(1) / sum; + VScal(&sum, &y[i * n], &y[i * n], n); + } else { + for (int j = 0; j < m; ++j) { + StrideSum(&y[i * n + j], &sum, n/m, m); + sum = static_cast(1) / sum; + StrideScal(&sum, &y[i * n + j], &y[i * n + j], n/m, m); + } + } } } @@ -193,6 +207,7 @@ DECLARE_MKL_KERNEL(VAdd); // AXYN DECLARE_MKL_KERNEL(VScal); +DECLARE_MKL_KERNEL(StrideScal); // XYN DECLARE_MKL_KERNEL(VExp); diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index ffab9c1457..9a39809c93 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -12,6 +12,7 @@ USE_JITKERNEL_REFER(kVAdd) USE_JITKERNEL_REFER(kVAddRelu) USE_JITKERNEL_REFER(kVSub) USE_JITKERNEL_REFER(kVScal) +USE_JITKERNEL_REFER(kStrideScal) USE_JITKERNEL_REFER(kVAddBias) USE_JITKERNEL_REFER(kVCopy) USE_JITKERNEL_REFER(kVRelu) @@ -32,6 +33,7 @@ USE_JITKERNEL_REFER(kMatMul) USE_JITKERNEL_REFER(kVSquare) USE_JITKERNEL_REFER(kHSum) USE_JITKERNEL_REFER(kHMax) +USE_JITKERNEL_REFER(kStrideSum) 
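In the MKL path above, StrideScal and StrideASum map onto cblas_?scal and cblas_?asum with incX = stride. A BLAS call with increment `stride` visits `count` elements spaced `stride` apart, and in this patch the two backends do not yet agree on whether the length argument is the total span or the per-group count; PATCH 12 below settles the convention by passing the full span n from the callers, letting the MKL wrappers divide it (n / stride), and having the reference version walk the whole span in steps of the stride. A plain-loop sketch of the BLAS semantics used here (function names are mine):

#include <cmath>

// Plain-loop equivalent of cblas_sscal(count, a, y, stride): scales `count`
// elements that sit `stride` apart, leaving the elements in between untouched.
void ScalStrided(int count, float a, float* y, int stride) {
  for (int i = 0; i < count; ++i) y[i * stride] *= a;
}

// Plain-loop equivalent of cblas_sasum(count, x, stride): sum of absolute
// values of `count` elements spaced `stride` apart.
float ASumStrided(int count, const float* x, int stride) {
  float sum = 0.f;
  for (int i = 0; i < count; ++i) sum += std::fabs(x[i * stride]);
  return sum;
}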
USE_JITKERNEL_REFER(kSoftmax) USE_JITKERNEL_REFER(kEmbSeqPool) USE_JITKERNEL_REFER(kSgd) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 0d1c477090..704124e805 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -27,6 +27,7 @@ REGISTER_REFER_KERNEL(VAddRelu); REGISTER_REFER_KERNEL(VSub); REGISTER_REFER_KERNEL(VScal); +REGISTER_REFER_KERNEL(StrideScal); REGISTER_REFER_KERNEL(VAddBias); REGISTER_REFER_KERNEL(VRelu); @@ -51,6 +52,7 @@ REGISTER_REFER_KERNEL(SeqPool); REGISTER_REFER_KERNEL(MatMul); REGISTER_REFER_KERNEL(HMax); REGISTER_REFER_KERNEL(HSum); +REGISTER_REFER_KERNEL(StrideSum); REGISTER_REFER_KERNEL(Softmax); REGISTER_REFER_KERNEL(EmbSeqPool); REGISTER_REFER_KERNEL(Sgd); diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index cac705a484..dee9245524 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -411,19 +411,42 @@ void HSum(const T* x, T* res, int n) { } } +template +void StrideSum(const T* x, T* res, int n, int stride) { + res[0] = x[0]; + for (int i = stride; i < n; i+=stride) { + res[0] += x[i]; + } +} + +template +void StrideScal(const T* a, const T* x, T* y, int n , int stride) { + for (int i = 0; i < n; i+=stride) { + y[i] = x[i] * a[0]; + } +} + // y = e^(x - max(x)) // y = y / sum(y) template -void Softmax(const T* x, T* y, int n, int bs = 1) { +void Softmax(const T* x, T* y, int n, int bs = 1, int m = 1) { for (int i = 0; i < bs; ++i) { T scalar; HMax(x, &scalar, n); scalar = static_cast(0) - scalar; VAddBias(&scalar, x, y, n); // x - max VExp(y, y, n); - HSum(y, &scalar, n); - scalar = static_cast(1) / scalar; - VScal(&scalar, y, y, n); + if (m == 1) { + HSum(y, &scalar, n); + scalar = static_cast(1) / scalar; + VScal(&scalar, y, y, n); + } else { + for (int j = 0; j < m; j++) { + StrideSum(&y[j], &scalar, n, m); + scalar = static_cast(1) / scalar; + StrideScal(&scalar, &y[j], &y[j], n, m); + } + } x += n; y += n; } @@ -507,6 +530,9 @@ DECLARE_REFER_KERNEL(VSub); DECLARE_REFER_KERNEL(VScal); DECLARE_REFER_KERNEL(VAddBias); +// const T* a, const T* x, T* y, int n, int stride +DECLARE_REFER_KERNEL(StrideScal); + // const T* x, T* y, int n DECLARE_REFER_KERNEL(VRelu); DECLARE_REFER_KERNEL(VIdentity); @@ -528,6 +554,8 @@ DECLARE_REFER_KERNEL(GRUHtPart2); DECLARE_REFER_KERNEL(HMax); DECLARE_REFER_KERNEL(HSum); +DECLARE_REFER_KERNEL(StrideSum); + // others DECLARE_REFER_KERNEL(CRFDecoding); DECLARE_REFER_KERNEL(LayerNorm); diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 6c099a7a06..93a448166f 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -723,39 +723,44 @@ void TestKernelSoftmax() { VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector x(bs * n), y(bs * n); - RandomVec(bs * n, x.data()); - const T* x_data = x.data(); - T* y_data = y.data(); + for (int m : {1, 2}) { + if (m > n || n % m != 0) { + continue; + } + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + std::vector x(bs * n), y(bs * n); + RandomVec(bs * n, x.data()); + const T* x_data = x.data(); + T* y_data = y.data(); - std::vector xinp(x.size()); // inplace test - std::copy(x.begin(), x.end(), xinp.begin()); - ref(x_data, y_data, n, bs); - T* xinp_data 
= xinp.data(); - ref(xinp_data, xinp_data, n, bs); - ExpectEQ(xinp_data, y_data, n * bs); + std::vector xinp(x.size()); // inplace test + std::copy(x.begin(), x.end(), xinp.begin()); + ref(x_data, y_data, n, bs, m); + T* xinp_data = xinp.data(); + ref(xinp_data, xinp_data, n, bs, m); + ExpectEQ(xinp_data, y_data, n * bs); - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, const std::vector& yref, - int n, int bs) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - EXPECT_EQ(x.size(), static_cast(n * bs)); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - std::vector ytgt(n * bs); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, n, bs); - ExpectEQ(ytgt_data, yref_data, n * bs); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, n, bs); - ExpectEQ(ytgt_data, yref_data, n * bs); - }; - TestAllImpls(n, verifier, x, y, n, bs); + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const std::vector& yref, + int n, int bs, int m) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + EXPECT_EQ(x.size(), static_cast(n * bs)); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + std::vector ytgt(n * bs); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, n, bs, m); + ExpectEQ(ytgt_data, yref_data, n * bs); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, n, bs, m); + ExpectEQ(ytgt_data, yref_data, n * bs); + }; + TestAllImpls(n, verifier, x, y, n, bs, m); + } } } } diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 9bcb272b93..dea8142cc8 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -76,8 +76,8 @@ using enable_if_CPU = typename std::enable_if< template class SoftmaxFunctor> { - void operator()(const DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y) { + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* X, framework::Tensor* Y) { auto in_dims = X->dims(); const float* in_data = X->data(); float* out_data = Y->data(); @@ -87,7 +87,8 @@ class SoftmaxFunctor> { auto compute_softmax = jit::KernelFuncs, platform::CPUPlace>::Cache() .At(in_dims[kClassDim]); - compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); + compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim], + in_dims[kClassDim] / axis_dim); } }; diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 578ab8eee3..9cbb6691f4 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -42,9 +42,18 @@ class SoftmaxOp : public framework::OperatorWithKernel { auto dim_x = ctx->GetInputDim("X"); auto rank_x = dim_x.size(); auto axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE(axis >= -1 && axis < rank_x, - "Attr(axis) value should larger equal then -1" - "and less then the rank of Input(X)"); + PADDLE_ENFORCE(axis >= -rank_x && axis < rank_x, + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X)."); + + auto use_cudnn = ctx->Attrs().Get("use_cudnn"); + auto use_mkldnn = ctx->Attrs().Get("use_mkldnn"); + if (axis != rank_x - 1 && axis != -1) { + PADDLE_ENFORCE(!use_cudnn, + "CUDNN kernel only support axis as -1."); + PADDLE_ENFORCE(!use_mkldnn, + "MKLDNN 
kernel only support axis as -1."); + } ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 76e8eeab08..bbea935101 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -63,8 +63,6 @@ class SoftmaxKernel : public framework::OpKernel { Tensor X_2d, Out_2d; X_2d.ShareDataWith(*X).Resize({n, d}); Out_2d.ShareDataWith(*Out).Resize({n, d}); - // Tensor X_2d = framework::ReshapeToMatrix(*X, axis - 1); - // Tensor Out_2d = framework::ReshapeToMatrix(*Out, axis - 1); #ifdef PADDLE_ON_INFERENCE math::SoftmaxFunctor()( @@ -96,9 +94,6 @@ class SoftmaxGradKernel : public framework::OpKernel { dX_2d.ShareDataWith(*dX).Resize({n, d}); Out_2d.ShareDataWith(*Out).Resize({n, d}); dOut_2d.ShareDataWith(*dOut).Resize({n, d}); - // Tensor Out_2d = framework::ReshapeToMatrix(*Out, axis - 1); - // Tensor dOut_2d = framework::ReshapeToMatrix(*dOut, axis - 1); - // Tensor dX_2d = framework::ReshapeToMatrix(*dX, axis - 1); math::SoftmaxGradFunctor()( context.template device_context(), axis_dim, &Out_2d, &dOut_2d, diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 2e779270f0..8b07126028 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -125,26 +125,6 @@ class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp): return [2, 3, 4, 5] -@unittest.skipIf(not core.is_compiled_with_cuda(), - "core is not compiled with CUDA") -class TestSoftmaxCUDNNOp3(TestSoftmaxCUDNNOp): - def get_x_shape(self): - return [2, 3, 4, 5] - - def get_axis(self): - return 0 - - -@unittest.skipIf(not core.is_compiled_with_cuda(), - "core is not compiled with CUDA") -class TestSoftmaxCUDNNOp4(TestSoftmaxCUDNNOp): - def get_x_shape(self): - return [2, 3, 4, 5] - - def get_axis(self): - return 1 - - @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp): @@ -152,7 +132,7 @@ class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp): return [2, 3, 4, 5] def get_axis(self): - return 2 + return 3 @unittest.skipIf(not core.is_compiled_with_cuda(), From 51536f7f52130237ea9e9ad1a00687ba5dd5b955 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 21 Mar 2019 05:25:34 +0000 Subject: [PATCH 11/18] StrideASum. 
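With the pieces above in place the data flow is: the kernel views X as an (n, d) matrix via SizeToAxis/SizeFromAxis, passes axis_dim = dims[axis] down to the functor, and the JIT CPU path derives remain = d / axis_dim (the in_dims[kClassDim] / axis_dim argument above); remain == 1 is the original last-axis fast path, anything larger takes the strided kernels. Since the cuDNN and MKL-DNN kernels are now gated to the last axis, only the native kernels ever see remain != 1. A worked shape example (numbers are illustrative, not from the patch):

#include <cassert>
#include <vector>

int main() {
  // Illustrative shape with softmax over axis = 1.
  std::vector<int> dims = {2, 3, 4, 5};
  int axis = 1;
  int n = dims[0];                      // SizeToAxis   -> 2 rows of the 2-D view
  int d = dims[1] * dims[2] * dims[3];  // SizeFromAxis -> 60 columns
  int axis_dim = dims[axis];            // 3
  int remain = d / axis_dim;            // 20 interleaved groups per row,
                                        // also the stride between group elements
  assert(n == 2 && d == 60 && axis_dim == 3 && remain == 20);
  return 0;
}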
test=develop --- paddle/fluid/operators/jit/helper.cc | 2 +- paddle/fluid/operators/jit/kernel_base.h | 4 ++-- paddle/fluid/operators/jit/more/mix/mix.cc | 2 +- paddle/fluid/operators/jit/more/mkl/mkl.cc | 6 +++--- paddle/fluid/operators/jit/more/mkl/mkl.h | 4 ++-- paddle/fluid/operators/jit/refer/CMakeLists.txt | 2 +- paddle/fluid/operators/jit/refer/refer.cc | 2 +- paddle/fluid/operators/jit/refer/refer.h | 8 ++++---- paddle/fluid/operators/jit/test.cc | 1 + 9 files changed, 16 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index fe508788ef..f868c847bd 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -56,7 +56,7 @@ const char* to_string(KernelType kt) { ONE_CASE(kMatMul); ONE_CASE(kHMax); ONE_CASE(kHSum); - ONE_CASE(kStrideSum); + ONE_CASE(kStrideASum); ONE_CASE(kSoftmax); ONE_CASE(kEmbSeqPool); ONE_CASE(kSgd); diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 6fd8a59d55..fdd41a830a 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -53,7 +53,7 @@ typedef enum { kVSquare, kVSub, kVTanh, - kStrideSum, + kStrideASum, kStrideScal, } KernelType; @@ -132,7 +132,7 @@ DECLARE_KERNELTUPLE(XYNTuple, VCopy); DECLARE_KERNELTUPLE(XRNTuple, HMax); DECLARE_KERNELTUPLE(XRNTuple, HSum); -DECLARE_KERNELTUPLE(XRNSTuple, StrideSum); +DECLARE_KERNELTUPLE(XRNSTuple, StrideASum); typedef struct { void* gates; // gates: x_ch, x_ih, x_fh, x_oh diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 58a44d4b55..463e45f6ce 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -54,7 +54,7 @@ void Softmax(const T* x, T* y, int n, int bs, int m) { auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_stridesum = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_stridesum = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_stridescal = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vaddbias = KernelFuncs, CPUPlace>::Cache().At(n); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 2828d75815..9e21e2b8d3 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -147,12 +147,12 @@ void ASum(const double* x, double* res, int n) { } template <> -void StrideSum(const float* x, float* res, int n, int stride) { +void StrideASum(const float* x, float* res, int n, int stride) { res[0] = platform::dynload::cblas_sasum(n, x, stride); } template <> -void StrideSum(const double* x, double* res, int n, int stride) { +void StrideASum(const double* x, double* res, int n, int stride) { res[0] = platform::dynload::cblas_dasum(n, x, stride); } @@ -174,7 +174,7 @@ bool VScalKernel::CanBeUsed(const int& d) const { template <> bool StrideScalKernel::CanBeUsed(const int& d) const { - return platform::MayIUse(platform::avx512f) && d > 512; + return true; } template <> diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 1e974c095f..2f135f9e7a 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -129,7 +129,7 @@ template void ASum(const T* x, T* res, int n); 
template -void StrideSum(const T* x, T* res, int n, int stride); +void StrideASum(const T* x, T* res, int n, int stride); template void StrideScal(const T* a, const T* x, T* y, int n, int stride); @@ -155,7 +155,7 @@ void Softmax(const T* x, T* y, int n, int bs, int m=1) { VScal(&sum, &y[i * n], &y[i * n], n); } else { for (int j = 0; j < m; ++j) { - StrideSum(&y[i * n + j], &sum, n/m, m); + StrideASum(&y[i * n + j], &sum, n/m, m); sum = static_cast(1) / sum; StrideScal(&sum, &y[i * n + j], &y[i * n + j], n/m, m); } diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 9a39809c93..7133f59662 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -33,7 +33,7 @@ USE_JITKERNEL_REFER(kMatMul) USE_JITKERNEL_REFER(kVSquare) USE_JITKERNEL_REFER(kHSum) USE_JITKERNEL_REFER(kHMax) -USE_JITKERNEL_REFER(kStrideSum) +USE_JITKERNEL_REFER(kStrideASum) USE_JITKERNEL_REFER(kSoftmax) USE_JITKERNEL_REFER(kEmbSeqPool) USE_JITKERNEL_REFER(kSgd) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 704124e805..460cb6c580 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -52,7 +52,7 @@ REGISTER_REFER_KERNEL(SeqPool); REGISTER_REFER_KERNEL(MatMul); REGISTER_REFER_KERNEL(HMax); REGISTER_REFER_KERNEL(HSum); -REGISTER_REFER_KERNEL(StrideSum); +REGISTER_REFER_KERNEL(StrideASum); REGISTER_REFER_KERNEL(Softmax); REGISTER_REFER_KERNEL(EmbSeqPool); REGISTER_REFER_KERNEL(Sgd); diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index dee9245524..e3387f60a6 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -412,10 +412,10 @@ void HSum(const T* x, T* res, int n) { } template -void StrideSum(const T* x, T* res, int n, int stride) { +void StrideASum(const T* x, T* res, int n, int stride) { res[0] = x[0]; for (int i = stride; i < n; i+=stride) { - res[0] += x[i]; + res[0] += std::abs(x[i]); } } @@ -442,7 +442,7 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int m = 1) { VScal(&scalar, y, y, n); } else { for (int j = 0; j < m; j++) { - StrideSum(&y[j], &scalar, n, m); + StrideASum(&y[j], &scalar, n, m); scalar = static_cast(1) / scalar; StrideScal(&scalar, &y[j], &y[j], n, m); } @@ -554,7 +554,7 @@ DECLARE_REFER_KERNEL(GRUHtPart2); DECLARE_REFER_KERNEL(HMax); DECLARE_REFER_KERNEL(HSum); -DECLARE_REFER_KERNEL(StrideSum); +DECLARE_REFER_KERNEL(StrideASum); // others DECLARE_REFER_KERNEL(CRFDecoding); diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 93a448166f..c47ec01d3e 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -727,6 +727,7 @@ void TestKernelSoftmax() { if (m > n || n % m != 0) { continue; } + VLOG(10) << "Softmax: " << bs << ", " << n << ", " << m; auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(bs * n), y(bs * n); From f45aced59b819de607fc6560c737be63d7c74d7a Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 24 Mar 2019 07:34:30 +0000 Subject: [PATCH 12/18] add jit test. 
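The rename above from StrideSum to StrideASum tracks the fact that the MKL backend uses cblas_?asum, which sums absolute values, and the reference version now applies std::abs as well. For the softmax use the summands are exp() outputs and therefore positive, so the absolute sum and the plain sum coincide. (Minor aside: the reference version seeds the accumulator with x[0] rather than std::abs(x[0]); for non-negative data that makes no difference, though it would differ from BLAS if the first element were negative.) A tiny check of that equivalence, with made-up values:

#include <cassert>
#include <cmath>

int main() {
  // Positive inputs (as exp() outputs are), strided over every second element.
  const float x[] = {0.5f, 2.f, 1.5f, 4.f, 3.f, 8.f};
  const int n = 6, stride = 2;
  float plain = 0.f, absolute = 0.f;
  for (int i = 0; i < n; i += stride) {
    plain += x[i];
    absolute += std::fabs(x[i]);
  }
  assert(plain == absolute);  // 0.5 + 1.5 + 3.0 on both sides
  return 0;
}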
develop=test --- paddle/fluid/operators/jit/more/mix/mix.cc | 10 +-- paddle/fluid/operators/jit/more/mix/mix.h | 2 +- paddle/fluid/operators/jit/more/mkl/mkl.cc | 8 +- paddle/fluid/operators/jit/more/mkl/mkl.h | 10 +-- paddle/fluid/operators/jit/refer/refer.h | 18 +++-- paddle/fluid/operators/jit/test.cc | 90 +++++++++++++++++++++- 6 files changed, 112 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 463e45f6ce..4f309501b6 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -50,7 +50,7 @@ void VTanh(const T* x, T* y, int n) { compute_addbias(&b, y, y, n); } -void Softmax(const T* x, T* y, int n, int bs, int m) { +void Softmax(const T* x, T* y, int n, int bs, int remain) { auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); @@ -66,15 +66,15 @@ void Softmax(const T* x, T* y, int n, int bs, int m) { scalar = static_cast(0) - scalar; compute_vaddbias(&scalar, x, y, n); // x - max compute_vexp(y, y, n); - if (m == 1) { + if (remain == 1) { compute_hsum(y, &scalar, n); scalar = static_cast(1) / scalar; compute_vscal(&scalar, y, y, n); } else { - for (int j = 0; j < m; ++j) { - compute_stridesum(&y[j], &scalar, n, m); + for (int j = 0; j < remain; ++j) { + compute_stridesum(&y[j], &scalar, n, remain); scalar = static_cast(1) / scalar; - compute_stridescal(&scalar, &y[j], &y[j], n, m); + compute_stridescal(&scalar, &y[j], &y[j], n, remain); } } x += n; diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index a0079506f8..035425317e 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -26,7 +26,7 @@ using T = float; void VSigmoid(const T* x, T* y, int n); void VTanh(const T* x, T* y, int n); -void Softmax(const T* x, T* y, int n, int bs, int m); +void Softmax(const T* x, T* y, int n, int bs, int remain); void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr); void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 9e21e2b8d3..fc8800ec72 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -81,7 +81,7 @@ void VScal(const double* a, const double* x, double* y, int n) { template <> void StrideScal(const float* a, const float* x, float* y, int n, int stride) { if (x == y) { - platform::dynload::cblas_sscal(n, *a, y, stride); + platform::dynload::cblas_sscal(n/stride, *a, y, stride); } else { refer::StrideScal(a, x, y, n, stride); } @@ -90,7 +90,7 @@ void StrideScal(const float* a, const float* x, float* y, int n, int stri template <> void StrideScal(const double* a, const double* x, double* y, int n, int stride) { if (x == y) { - platform::dynload::cblas_dscal(n, *a, y, stride); + platform::dynload::cblas_dscal(n/stride, *a, y, stride); } else { refer::StrideScal(a, x, y, n, stride); } @@ -148,12 +148,12 @@ void ASum(const double* x, double* res, int n) { template <> void StrideASum(const float* x, float* res, int n, int stride) { - res[0] = platform::dynload::cblas_sasum(n, x, stride); + res[0] = platform::dynload::cblas_sasum(n/stride, x, stride); } template <> void StrideASum(const double* x, double* res, int n, int stride) { - res[0] = platform::dynload::cblas_dasum(n, x, 
stride); + res[0] = platform::dynload::cblas_dasum(n/stride, x, stride); } // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 2f135f9e7a..1fbb87b0cf 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -135,7 +135,7 @@ template void StrideScal(const T* a, const T* x, T* y, int n, int stride); template -void Softmax(const T* x, T* y, int n, int bs, int m=1) { +void Softmax(const T* x, T* y, int n, int bs, int remain=1) { std::vector entities(bs); for (int i = 0; i < bs; ++i) { entities[i] = x[i * n]; @@ -149,15 +149,15 @@ void Softmax(const T* x, T* y, int n, int bs, int m=1) { VExp(y, y, n * bs); for (int i = 0; i < bs; ++i) { T sum; - if (m == 1) { + if (remain == 1) { ASum(&y[i * n], &sum, n); sum = static_cast(1) / sum; VScal(&sum, &y[i * n], &y[i * n], n); } else { - for (int j = 0; j < m; ++j) { - StrideASum(&y[i * n + j], &sum, n/m, m); + for (int j = 0; j < remain; ++j) { + StrideASum(&y[i * n + j], &sum, n, remain); sum = static_cast(1) / sum; - StrideScal(&sum, &y[i * n + j], &y[i * n + j], n/m, m); + StrideScal(&sum, &y[i * n + j], &y[i * n + j], n, remain); } } } diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index e3387f60a6..c62925232b 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -421,30 +421,34 @@ void StrideASum(const T* x, T* res, int n, int stride) { template void StrideScal(const T* a, const T* x, T* y, int n , int stride) { - for (int i = 0; i < n; i+=stride) { - y[i] = x[i] * a[0]; + for (int i = 0; i < n; ++i) { + if (i % stride == 0) { + y[i] = x[i] * a[0]; + } else { + y[i] = x[i]; + } } } // y = e^(x - max(x)) // y = y / sum(y) template -void Softmax(const T* x, T* y, int n, int bs = 1, int m = 1) { +void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) { for (int i = 0; i < bs; ++i) { T scalar; HMax(x, &scalar, n); scalar = static_cast(0) - scalar; VAddBias(&scalar, x, y, n); // x - max VExp(y, y, n); - if (m == 1) { + if (remain == 1) { HSum(y, &scalar, n); scalar = static_cast(1) / scalar; VScal(&scalar, y, y, n); } else { - for (int j = 0; j < m; j++) { - StrideASum(&y[j], &scalar, n, m); + for (int j = 0; j < remain; j++) { + StrideASum(&y[j], &scalar, n, remain); scalar = static_cast(1) / scalar; - StrideScal(&scalar, &y[j], &y[j], n, m); + StrideScal(&scalar, &y[j], &y[j], n, remain); } } x += n; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index c47ec01d3e..1397e5be18 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -723,11 +723,10 @@ void TestKernelSoftmax() { VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - for (int m : {1, 2}) { + for (int m : {1, 2, 3}) { // remain if (m > n || n % m != 0) { continue; } - VLOG(10) << "Softmax: " << bs << ", " << n << ", " << m; auto ref = jit::GetReferFunc(); EXPECT_TRUE(ref != nullptr); std::vector x(bs * n), y(bs * n); @@ -766,6 +765,86 @@ void TestKernelSoftmax() { } } +template +void TestKernelStrideASum() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + for (int d : TestSizes()) { + for (int m : {1, 2, 3}) { // stride + if (m > d || d % m != 0) { + continue; + } + auto ref = jit::GetReferFunc(); 
+ EXPECT_TRUE(ref != nullptr); + std::vector x(d); + RandomVec(d, x.data()); + T ref_res; + ref(x.data(), &ref_res, d, m); + + auto verifier = [](const typename KernelTuple::func_type tgt, + const std::vector& x, const T ref_res, + const int m) { + EXPECT_TRUE(tgt != nullptr); + T tgt_res; + tgt(x.data(), &tgt_res, x.size(), m); + ExpectEQ(&tgt_res, &ref_res, 1); + }; + TestAllImpls(d, verifier, x, ref_res, m); + } + } +} + +template +void TestKernelStrideScal() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + // for (int d : TestSizes()) { + // for (int m : {1, 2, 3}) { // stride + for (int d : {4}) { + for (int m : {2}) { // stride + if (m > d || d % m != 0) { + continue; + } + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + + const T a = static_cast(3); + std::vector x(d), yref(d); + std::vector xinp(d); // inplace test + RandomVec(d, x.data()); + std::copy(x.begin(), x.end(), xinp.begin()); + + const T* x_data = x.data(); + T* yref_data = yref.data(); + T* xinp_data = xinp.data(); + // test refer code inplace + ref(&a, x_data, yref_data, d, m); + ref(&a, xinp_data, xinp_data, d, m); + ExpectEQ(xinp_data, yref_data, d); + + auto verifier = [](const typename KernelTuple::func_type tgt, const T a, + const std::vector& x, const std::vector& yref, + const int m) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + const int d = yref.size(); + std::vector ytgt(d); + T* ytgt_data = ytgt.data(); + // test normal + tgt(&a, x_data, ytgt_data, d, m); + ExpectEQ(ytgt_data, yref_data, d); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(&a, ytgt_data, ytgt_data, d, m); + ExpectEQ(ytgt_data, yref_data, d); + }; + TestAllImpls(d, verifier, a, x, yref, m); + } + } +} + template void TestKernelSgd() { using T = typename KernelTuple::data_type; @@ -918,7 +997,7 @@ TEST(JITKernel_pool, more) { EXPECT_EQ(kers.size(), 10UL); #else #ifdef PADDLE_WITH_MKLML - EXPECT_EQ(kers.size(), 21UL); + EXPECT_EQ(kers.size(), 22UL); #else EXPECT_EQ(kers.size(), 8UL); #endif @@ -927,7 +1006,7 @@ TEST(JITKernel_pool, more) { TEST(JITKernel_pool, refer) { const auto& kers = jit::ReferKernelPool::Instance().AllKernels(); - EXPECT_EQ(kers.size(), 29UL); + EXPECT_EQ(kers.size(), 31UL); } // test helper @@ -1298,3 +1377,6 @@ TEST_CPU_KERNEL(MatMul); TEST_CPU_KERNEL(Softmax); TEST_CPU_KERNEL(Sgd); TEST_CPU_KERNEL(VBroadcast); + +TEST_CPU_KERNEL(StrideASum); +TEST_CPU_KERNEL(StrideScal); From 90bd038d358ebcf30520da457d9672b0c4513b0e Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 25 Mar 2019 19:58:18 +0800 Subject: [PATCH 13/18] fix format. 
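PATCH 12 above also changes the reference StrideScal so that positions that are not multiples of the stride are copied through rather than skipped, which matters for the out-of-place case where every element of y must be written, and callers now pass the full group length n together with stride = remain. A small check of that pass-through behaviour (names and values are illustrative):

#include <cassert>
#include <vector>

// Sketch of the reference StrideScal after PATCH 12: elements at offsets
// 0, stride, 2*stride, ... are scaled, all other elements are copied unchanged.
void StrideScalRef(float a, const float* x, float* y, int n, int stride) {
  for (int i = 0; i < n; ++i) y[i] = (i % stride == 0) ? x[i] * a : x[i];
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f}, y(4);
  StrideScalRef(10.f, x.data(), y.data(), 4, /*stride=*/2);
  assert(y[0] == 10.f && y[1] == 2.f && y[2] == 30.f && y[3] == 4.f);
  return 0;
}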
test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/jit/more/mix/mix.cc | 6 ++++-- paddle/fluid/operators/jit/more/mkl/mkl.cc | 14 ++++++++------ paddle/fluid/operators/jit/more/mkl/mkl.h | 2 +- paddle/fluid/operators/jit/refer/refer.h | 4 ++-- paddle/fluid/operators/jit/test.cc | 8 ++++---- paddle/fluid/operators/math/softmax.h | 2 +- paddle/fluid/operators/math/softmax_impl.h | 5 +++-- paddle/fluid/operators/softmax_op.cc | 6 ++---- paddle/fluid/operators/softmax_op.h | 10 ++++++---- paddle/fluid/operators/warpctc_cudnn_op.cu.cc | 3 ++- 11 files changed, 34 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8849e31025..51c3c7bbf9 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -86,7 +86,7 @@ paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '37042620f9bd3a2da6e5d3138b2f724b')) paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a194fb80614023f543df3949fbd0d0b8')) paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '19ef6f9cdd27feac8a1ae060f19c10b4')) -paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', '85f9690b1b285def19077a41d9dba36c')) +paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', '502bad9e8bc7ef24817d0d4b20f61df3')) paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'bbd84e855e660cd1084bb71a2fd0cdaa')) paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625')) paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95')) diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 4f309501b6..1a9fc9ed7b 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -54,8 +54,10 @@ void Softmax(const T* x, T* y, int n, int bs, int remain) { auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_stridesum = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_stridescal = KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_stridesum = + KernelFuncs, CPUPlace>::Cache().At(n); + auto compute_stridescal = + KernelFuncs, CPUPlace>::Cache().At(n); auto 
compute_vaddbias = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vexp = KernelFuncs, CPUPlace>::Cache().At(n); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index fc8800ec72..75ebddb125 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -79,18 +79,20 @@ void VScal(const double* a, const double* x, double* y, int n) { } template <> -void StrideScal(const float* a, const float* x, float* y, int n, int stride) { +void StrideScal(const float* a, const float* x, float* y, int n, + int stride) { if (x == y) { - platform::dynload::cblas_sscal(n/stride, *a, y, stride); + platform::dynload::cblas_sscal(n / stride, *a, y, stride); } else { refer::StrideScal(a, x, y, n, stride); } } template <> -void StrideScal(const double* a, const double* x, double* y, int n, int stride) { +void StrideScal(const double* a, const double* x, double* y, int n, + int stride) { if (x == y) { - platform::dynload::cblas_dscal(n/stride, *a, y, stride); + platform::dynload::cblas_dscal(n / stride, *a, y, stride); } else { refer::StrideScal(a, x, y, n, stride); } @@ -148,12 +150,12 @@ void ASum(const double* x, double* res, int n) { template <> void StrideASum(const float* x, float* res, int n, int stride) { - res[0] = platform::dynload::cblas_sasum(n/stride, x, stride); + res[0] = platform::dynload::cblas_sasum(n / stride, x, stride); } template <> void StrideASum(const double* x, double* res, int n, int stride) { - res[0] = platform::dynload::cblas_dasum(n/stride, x, stride); + res[0] = platform::dynload::cblas_dasum(n / stride, x, stride); } // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 1fbb87b0cf..968895bb6f 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -135,7 +135,7 @@ template void StrideScal(const T* a, const T* x, T* y, int n, int stride); template -void Softmax(const T* x, T* y, int n, int bs, int remain=1) { +void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { std::vector entities(bs); for (int i = 0; i < bs; ++i) { entities[i] = x[i * n]; diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index c62925232b..4aeb2fd628 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -414,13 +414,13 @@ void HSum(const T* x, T* res, int n) { template void StrideASum(const T* x, T* res, int n, int stride) { res[0] = x[0]; - for (int i = stride; i < n; i+=stride) { + for (int i = stride; i < n; i += stride) { res[0] += std::abs(x[i]); } } template -void StrideScal(const T* a, const T* x, T* y, int n , int stride) { +void StrideScal(const T* a, const T* x, T* y, int n, int stride) { for (int i = 0; i < n; ++i) { if (i % stride == 0) { y[i] = x[i] * a[0]; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 1397e5be18..d8a0b2cbf5 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -723,7 +723,7 @@ void TestKernelSoftmax() { VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - for (int m : {1, 2, 3}) { // remain + for (int m : {1, 2, 3}) { // remain if (m > n || n % m != 0) { continue; } @@ -770,7 +770,7 @@ void TestKernelStrideASum() { using T = typename KernelTuple::data_type; 
VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - for (int m : {1, 2, 3}) { // stride + for (int m : {1, 2, 3}) { // stride if (m > d || d % m != 0) { continue; } @@ -782,7 +782,7 @@ void TestKernelStrideASum() { ref(x.data(), &ref_res, d, m); auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, const T ref_res, + const std::vector& x, const T ref_res, const int m) { EXPECT_TRUE(tgt != nullptr); T tgt_res; @@ -801,7 +801,7 @@ void TestKernelStrideScal() { // for (int d : TestSizes()) { // for (int m : {1, 2, 3}) { // stride for (int d : {4}) { - for (int m : {2}) { // stride + for (int m : {2}) { // stride if (m > d || d % m != 0) { continue; } diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h index f8e250fa2e..a7a30a71e4 100644 --- a/paddle/fluid/operators/math/softmax.h +++ b/paddle/fluid/operators/math/softmax.h @@ -31,7 +31,7 @@ template class SoftmaxGradFunctor { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* y, const framework::Tensor* y_grad, + const framework::Tensor* y, const framework::Tensor* y_grad, framework::Tensor* x_grad); }; diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index dea8142cc8..6f6f33345f 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -94,8 +94,9 @@ class SoftmaxFunctor> { template void SoftmaxGradFunctor::operator()( - const DeviceContext& context, const int axis_dim, const framework::Tensor* y, - const framework::Tensor* y_grad, framework::Tensor* x_grad) { + const DeviceContext& context, const int axis_dim, + const framework::Tensor* y, const framework::Tensor* y_grad, + framework::Tensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); auto logits_grad = EigenMatrix::From(*x_grad); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 9cbb6691f4..b812d2cdeb 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -49,10 +49,8 @@ class SoftmaxOp : public framework::OperatorWithKernel { auto use_cudnn = ctx->Attrs().Get("use_cudnn"); auto use_mkldnn = ctx->Attrs().Get("use_mkldnn"); if (axis != rank_x - 1 && axis != -1) { - PADDLE_ENFORCE(!use_cudnn, - "CUDNN kernel only support axis as -1."); - PADDLE_ENFORCE(!use_mkldnn, - "MKLDNN kernel only support axis as -1."); + PADDLE_ENFORCE(!use_cudnn, "CUDNN kernel only support axis as -1."); + PADDLE_ENFORCE(!use_mkldnn, "MKLDNN kernel only support axis as -1."); } ctx->SetOutputDim("Out", ctx->GetInputDim("X")); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index bbea935101..a964c3b57a 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -66,10 +66,12 @@ class SoftmaxKernel : public framework::OpKernel { #ifdef PADDLE_ON_INFERENCE math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_2d, &Out_2d); + context.template device_context(), axis_dim, &X_2d, + &Out_2d); #else math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_2d, &Out_2d); + context.template device_context(), axis_dim, &X_2d, + &Out_2d); #endif } }; @@ -96,8 +98,8 @@ class SoftmaxGradKernel : public framework::OpKernel { dOut_2d.ShareDataWith(*dOut).Resize({n, d}); math::SoftmaxGradFunctor()( - context.template 
device_context(), axis_dim, &Out_2d, &dOut_2d, - &dX_2d); + context.template device_context(), axis_dim, &Out_2d, + &dOut_2d, &dX_2d); } }; diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc index 716faf2995..8d97396fda 100644 --- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc +++ b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc @@ -69,7 +69,8 @@ class CudnnCTCKernel : public framework::OpKernel { int rank = logits->dims().size(); Tensor in_2d = framework::ReshapeToMatrix(*logits, rank - 1); Tensor out_2d = framework::ReshapeToMatrix(softmax_logits, rank - 1); - math::SoftmaxFunctor()(dev_ctx, -1, &in_2d, &out_2d); + math::SoftmaxFunctor()(dev_ctx, -1, &in_2d, + &out_2d); // ctc needs sequences data stored in transposed padding format // logits and grad using padding data of layout 'TNC' From d54005a7f43af4107aa117fbd517f81c025165b3 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 25 Mar 2019 14:23:05 +0000 Subject: [PATCH 14/18] fix unittest. test=develop --- paddle/fluid/operators/softmax_with_cross_entropy_op.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index ff99e4207a..2220d77e8a 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -40,10 +40,12 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); + int axis_dim = logits->dims()[logits->dims().size()-1]; + auto& dev_ctx = context.template device_context(); math::SoftmaxFunctor()( - dev_ctx, -1, logits, softmax); + dev_ctx, axis_dim, logits, softmax); math::CrossEntropyFunctor()( dev_ctx, loss, softmax, labels, context.Attr("soft_label"), context.Attr("ignore_index")); From ceb31d30f0d0766d27cef928aa5629bc5c92e474 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 26 Mar 2019 10:10:03 +0800 Subject: [PATCH 15/18] fix formax. 
test=develop --- paddle/fluid/operators/softmax_with_cross_entropy_op.h | 2 +- paddle/fluid/operators/warpctc_cudnn_op.cu.cc | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index 2220d77e8a..1042cbdcf5 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -40,7 +40,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - int axis_dim = logits->dims()[logits->dims().size()-1]; + int axis_dim = logits->dims()[logits->dims().size() - 1]; auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc index 8d97396fda..2a744f66f1 100644 --- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc +++ b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc @@ -67,9 +67,10 @@ class CudnnCTCKernel : public framework::OpKernel { softmax_logits.mutable_data(logits->dims(), ctx.GetPlace()); softmax_logits.set_lod(logits_lod); int rank = logits->dims().size(); + int axis_dim = logits->dims()[rank - 1]; Tensor in_2d = framework::ReshapeToMatrix(*logits, rank - 1); Tensor out_2d = framework::ReshapeToMatrix(softmax_logits, rank - 1); - math::SoftmaxFunctor()(dev_ctx, -1, &in_2d, + math::SoftmaxFunctor()(dev_ctx, axis_dim, &in_2d, &out_2d); // ctc needs sequences data stored in transposed padding format From 7920e3be02cbfef0f6400896f0bde4e8514c9024 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 26 Mar 2019 06:20:34 +0000 Subject: [PATCH 16/18] revert test_softmax_cudnn. test=develop --- .../mkldnn/test_softmax_mkldnn_op.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py index 3cf05d5d9f..748b77f2bf 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py @@ -32,30 +32,6 @@ class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): return [2, 3, 4, 5] -class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): - def get_x_shape(self): - return [2, 3, 4, 5] - - def get_axis(self): - return 0 - - -class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): - def get_x_shape(self): - return [2, 3, 4, 5] - - def get_axis(self): - return 1 - - -class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): - def get_x_shape(self): - return [2, 3, 4, 5] - - def get_axis(self): - return 2 - - # Check if primitives already exist in backward class TestSoftmaxMKLDNNPrimitivesAlreadyExist(unittest.TestCase): def setUp(self): From eb2123e12dc0ce1f6920aefa12b684f01bf9ca17 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 27 Mar 2019 06:17:28 +0000 Subject: [PATCH 17/18] fix doc and jit. 
test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/jit/kernel_base.h | 4 ++-- paddle/fluid/operators/jit/more/mix/mix.cc | 5 +++-- paddle/fluid/operators/jit/more/mkl/mkl.h | 1 + paddle/fluid/operators/jit/refer/refer.h | 1 + paddle/fluid/operators/jit/test.cc | 6 ++---- python/paddle/fluid/layers/nn.py | 5 ++++- 7 files changed, 14 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 51c3c7bbf9..6b6081d2cd 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -86,7 +86,7 @@ paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '37042620f9bd3a2da6e5d3138b2f724b')) paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a194fb80614023f543df3949fbd0d0b8')) paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '19ef6f9cdd27feac8a1ae060f19c10b4')) -paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', '502bad9e8bc7ef24817d0d4b20f61df3')) +paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', '59b1c6bf2f0fa9dc649c85fef3a3b2ea')) paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'bbd84e855e660cd1084bb71a2fd0cdaa')) paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625')) paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95')) diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index fdd41a830a..6e0393b820 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -38,6 +38,8 @@ typedef enum { kNCHW16CMulNC, kSeqPool, kSoftmax, + kStrideASum, + kStrideScal, kVAdd, kVAddBias, kVAddRelu, @@ -53,8 +55,6 @@ typedef enum { kVSquare, kVSub, kVTanh, - kStrideASum, - kStrideScal, } KernelType; typedef enum { diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 1a9fc9ed7b..f5b7bfff89 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -50,11 +50,12 @@ void VTanh(const T* x, T* y, int n) { compute_addbias(&b, y, y, n); } +// remain is the product of dimension shapes after the axis dimension void Softmax(const T* x, T* y, int n, int bs, int remain) { auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); auto 
compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_stridesum = + auto compute_strideasum = KernelFuncs, CPUPlace>::Cache().At(n); auto compute_stridescal = KernelFuncs, CPUPlace>::Cache().At(n); @@ -74,7 +75,7 @@ void Softmax(const T* x, T* y, int n, int bs, int remain) { compute_vscal(&scalar, y, y, n); } else { for (int j = 0; j < remain; ++j) { - compute_stridesum(&y[j], &scalar, n, remain); + compute_strideasum(&y[j], &scalar, n, remain); scalar = static_cast(1) / scalar; compute_stridescal(&scalar, &y[j], &y[j], n, remain); } diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index 968895bb6f..b38cc107b8 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -134,6 +134,7 @@ void StrideASum(const T* x, T* res, int n, int stride); template void StrideScal(const T* a, const T* x, T* y, int n, int stride); +// remain is the product of dimension shapes after the axis dimension template void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { std::vector entities(bs); diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 4aeb2fd628..136b99e0ae 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -432,6 +432,7 @@ void StrideScal(const T* a, const T* x, T* y, int n, int stride) { // y = e^(x - max(x)) // y = y / sum(y) +// remain is the product of dimension shapes after the axis dimension template void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) { for (int i = 0; i < bs; ++i) { diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index d8a0b2cbf5..178418f4a7 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -798,10 +798,8 @@ template void TestKernelStrideScal() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - // for (int d : TestSizes()) { - // for (int m : {1, 2, 3}) { // stride - for (int d : {4}) { - for (int m : {2}) { // stride + for (int d : TestSizes()) { + for (int m : {1, 2, 3}) { // stride if (m > d || d % m != 0) { continue; } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 19c9734a9e..215720417e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1826,7 +1826,7 @@ def softmax(input, use_cudnn=False, name=None, axis=-1): The dimension :attr:`axis` of the input tensor will be permuted to the last. Then the input tensor will be logically flattened to a 2-D matrix. The matrix's - second dimension(row length) is as same as the dimension :attr:`axis` of the input + second dimension(row length) is the same as the dimension :attr:`axis` of the input tensor, and the first dimension(column length) is the product of all other dimensions of the input tensor. For each row of the matrix, the softmax operator squashes the K-dimensional(K is the width of the matrix, which is also the size @@ -1864,7 +1864,10 @@ def softmax(input, use_cudnn=False, name=None, axis=-1): .. 
code-block:: python fc = fluid.layers.fc(input=x, size=10) + # perform softmax in the second dimension softmax = fluid.layers.softmax(input=fc, axis=1) + # perform softmax in the last dimension + softmax = fluid.layers.softmax(input=fc, axis=-1) """ helper = LayerHelper('softmax', **locals()) From 3e352388ebd7ca6cf24f2c2447f6ab5d15ab1b75 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 27 Mar 2019 14:55:25 +0800 Subject: [PATCH 18/18] fix format. test=develop --- paddle/fluid/operators/jit/test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 178418f4a7..d30fa014ed 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -799,7 +799,7 @@ void TestKernelStrideScal() { using T = typename KernelTuple::data_type; VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); for (int d : TestSizes()) { - for (int m : {1, 2, 3}) { // stride + for (int m : {1, 2, 3}) { // stride if (m > d || d % m != 0) { continue; }
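For readers following the axis handling above, the mapping from an N-D input and an axis attribute onto the (bs, n, remain) arguments that the JIT softmax takes can be summarized in a small stand-alone sketch. The struct and function names below are illustrative only (they are not part of the patch or of Paddle's API); the convention assumed is the one used by the kernels: bs is the product of the dimensions before the axis, remain is the product of the dimensions after it, axis_dim is the size of the axis itself, and each logical row has length n = axis_dim * remain.

    // Illustrative helper: compute the (bs, n, remain, axis_dim) layout for a
    // softmax over `axis` of a row-major tensor with the given shape.
    #include <cstdio>
    #include <vector>

    struct SoftmaxLayout {
      int bs;        // product of dims before `axis`: number of rows
      int n;         // row length: axis_dim * remain
      int remain;    // product of dims after `axis`
      int axis_dim;  // size of the softmax axis
    };

    SoftmaxLayout MakeLayout(const std::vector<int>& dims, int axis) {
      if (axis < 0) axis += static_cast<int>(dims.size());
      SoftmaxLayout l{1, 1, 1, dims[axis]};
      for (int i = 0; i < axis; ++i) l.bs *= dims[i];
      for (int i = axis + 1; i < static_cast<int>(dims.size()); ++i) l.remain *= dims[i];
      l.n = l.axis_dim * l.remain;
      return l;
    }

    int main() {
      // shape [2, 3, 4, 5] with softmax over axis 1 (size 3)
      SoftmaxLayout l = MakeLayout({2, 3, 4, 5}, 1);
      std::printf("bs=%d n=%d remain=%d axis_dim=%d\n", l.bs, l.n, l.remain, l.axis_dim);
      // prints: bs=2 n=60 remain=20 axis_dim=3
      return 0;
    }

With axis equal to rank - 1 (or -1) this degenerates to remain == 1, which is exactly the fast path the kernels above keep unchanged.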
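Given that layout, the new strided primitives are enough to express the axis softmax itself. Below is a minimal self-contained sketch of the technique, not the patch's code: it uses one shared max per row for numerical stability, exponentiates the whole row, then normalizes each of the remain interleaved columns with a strided sum followed by a strided scale. Unlike the reference StrideScal in the diff, this sketch only touches every stride-th element and trims the length per column, so each call stays strictly inside the row; it assumes n = axis_dim * remain.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Strided absolute sum: |x[0]| + |x[stride]| + |x[2*stride]| + ...
    template <typename T>
    void StrideASum(const T* x, T* res, int n, int stride) {
      res[0] = std::abs(x[0]);
      for (int i = stride; i < n; i += stride) res[0] += std::abs(x[i]);
    }

    // Strided scale: multiply every `stride`-th element by a[0], leave the rest untouched.
    template <typename T>
    void StrideScal(const T* a, const T* x, T* y, int n, int stride) {
      for (int i = 0; i < n; i += stride) y[i] = x[i] * a[0];
    }

    // Softmax for one row of length n that holds `remain` interleaved columns.
    template <typename T>
    void SoftmaxRow(const T* x, T* y, int n, int remain) {
      T max_v = *std::max_element(x, x + n);        // a single max keeps exp() stable
      for (int i = 0; i < n; ++i) y[i] = std::exp(x[i] - max_v);
      for (int j = 0; j < remain; ++j) {            // normalize column j independently
        T sum;
        StrideASum(&y[j], &sum, n - j, remain);
        T inv = static_cast<T>(1) / sum;
        StrideScal(&inv, &y[j], &y[j], n - j, remain);
      }
    }

    int main() {
      // logical shape [2, 3] with softmax over axis 0: n = 6, remain = 3
      std::vector<float> x = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}, y(6);
      SoftmaxRow(x.data(), y.data(), 6, 3);
      for (float v : y) std::printf("%.4f ", v);    // each pair (y[j], y[j+3]) sums to 1
      std::printf("\n");
      return 0;
    }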
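One MKL-side detail worth spelling out: cblas_sscal and cblas_dscal write only every incX-th element of their argument, so they can stand in for the strided scale only when the call is in-place (x == y); in the out-of-place case the elements between strides would never be copied to the output. A sketch of that dispatch, assuming a CBLAS header and library are available to link against (float case only):

    #include <cblas.h>

    // In-place calls go to BLAS; out-of-place calls fall back to a plain loop
    // that also copies the elements the strided routine would skip.
    void StrideScalF(const float* a, const float* x, float* y, int n, int stride) {
      if (x == y) {
        cblas_sscal(n / stride, *a, y, stride);           // fast in-place path
      } else {
        for (int i = 0; i < n; ++i)
          y[i] = (i % stride == 0) ? x[i] * (*a) : x[i];  // reference fallback
      }
    }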