From 938390b38d6837a443b1623544f6770352f42568 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Fri, 20 Jul 2018 14:53:05 +0800
Subject: [PATCH 1/3] fix mac build of sendrecvop_utils

---
 paddle/fluid/operators/distributed/sendrecvop_utils.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 98a5dcbbb8..6a3f8fd544 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -28,10 +28,12 @@ namespace distributed {
 
 using VarMsg = sendrecv::VariableMessage;
 
+#ifdef PADDLE_WITH_CUDA
 void* GetVarPayLoad(const std::string varname, int64_t size) {
   platform::CUDAPinnedPlace cuda_pinned;
   return memory::Alloc(cuda_pinned, size);
 }
+#endif
 
 void GetTensorPayload(framework::Variable* var,
                       const platform::DeviceContext& ctx, VarMsg* request,

From 5d718a5886478a4d1349d9e85ad217c4d4970b5e Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Sun, 22 Jul 2018 15:19:27 +0800
Subject: [PATCH 2/3] optimize reduce_sum_grad op

---
 paddle/fluid/operators/reduce_op.h            | 29 ++++++++++++
 paddle/fluid/operators/reduce_sum_op.h        |  2 +-
 python/paddle/fluid/layers/nn.py              |  4 +-
 .../fluid/tests/unittests/test_reduce_op.py   | 44 ++++++++++---------
 4 files changed, 56 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
index 72b6cf1773..735ad3af2a 100644
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
@@ -88,6 +88,35 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto* output = context.Output<Tensor>(framework::GradVarName("X"));
     output->mutable_data<T>(context.GetPlace());
 
+    if (context.GetPlace().type() == typeid(platform::CPUPlace)) {
+      const auto* input2_d = input2->data<T>();
+      auto* output_d = output->data<T>();
+
+      // CPU reduce_all_grad
+      if (reduce_all) {
+        PADDLE_ENFORCE(input2->dims().size() == 1 && input2->dims()[0] == 1,
+                       "output should be a scalar");
+        for (int64_t i = 0; i < framework::product(input0->dims()); ++i) {
+          output_d[i] = input2_d[0];
+        }
+        return;
+      }
+
+      if (input0->dims().size() == 2 && dims.size() == 1) {
+        auto& input_dim = input0->dims();
+        for (int64_t i = 0; i < input_dim[0]; ++i) {
+          for (int64_t j = 0; j < input_dim[1]; ++j) {
+            if (dims[0] == 0) {
+              output_d[i * input_dim[1] + j] = input2_d[j];
+            } else {
+              output_d[i * input_dim[1] + j] = input2_d[i];
+            }
+          }
+        }
+        return;
+      }
+    }
+
     if (reduce_all) {
       auto x = EigenVector<T>::Flatten(*input0);
       auto x_reduce = EigenVector<T>::From(*input1);
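Note: the CPU branch added above exploits the fact that the gradient of reduce_sum is just the upstream gradient dy broadcast back to the input shape, so each dx element is a plain copy of one dy element. A minimal NumPy sketch of the 2-D case (illustration only, not part of the patch):

    import numpy as np

    x = np.random.random((4, 3))
    dy = np.random.random(3)  # upstream gradient of reduce_sum(x, dim=0)

    # Loop-based copy, mirroring the dims[0] == 0 branch above.
    dx = np.empty_like(x)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            dx[i, j] = dy[j]

    # Same result as the generic Eigen broadcast path.
    assert np.array_equal(dx, np.broadcast_to(dy, x.shape))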
diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h
index e67d7e1da5..248782ce97 100644
--- a/paddle/fluid/operators/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_sum_op.h
@@ -31,7 +31,7 @@ struct SumGradFunctor {
             typename DY, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim);
+    dx->device(place) = dy->eval().broadcast(dim);
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ab40d0c217..4df806216a 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2961,7 +2961,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             # x is a Tensor variable with following elements:
             #    [[0.2, 0.3, 0.5, 0.9]
             #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the correspending output tensor.
+            # Each example is followed by the corresponding output tensor.
             fluid.layers.reduce_sum(x)  # [3.5]
             fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
             fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
@@ -2970,7 +2970,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             # x is a Tensor variable with shape [2, 2, 2] and elements as below:
             #      [[[1, 2], [3, 4]],
             #      [[5, 6], [7, 8]]]
-            # Each example is followed by the correspending output tensor.
+            # Each example is followed by the corresponding output tensor.
             fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
             fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]
 
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 865c2b7df0..dbc2892646 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -89,15 +89,11 @@ class TestProdOp(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class TestKeepDimReduce(OpTest):
+class Test1DReduce(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [-2], 'keep_dim': True}
-        self.outputs = {
-            'Out':
-            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
-        }
+        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
 
     def test_check_output(self):
         self.check_output()
@@ -106,32 +102,40 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
 
-class Test1DReduce(OpTest):
+class Test2DReduce0(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.attrs = {'dim': [0]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
         self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
 
-    def test_check_output(self):
-        self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+class Test2DReduce1(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=1)}
+
+
+class TestKeepDimReduce(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [-2], 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                        keepdims=self.attrs['keep_dim'])
+        }
 
 
-class TestReduceAll(OpTest):
+class TestReduceAll(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
         self.attrs = {'reduce_all': True}
         self.outputs = {'Out': self.inputs['X'].sum()}
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
 
 ## reduction in multi dims
 class TestReduceMeanOpMultiAxises(OpTest):

From 273f737517e3d2d3ef67725a143e834be025fa29 Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Sun, 22 Jul 2018 19:11:27 +0800
Subject: [PATCH 3/3] optimize code

---
 paddle/fluid/operators/reduce_op.h            | 29 ----------
 paddle/fluid/operators/reduce_sum_op.cc       | 19 +++---
 paddle/fluid/operators/reduce_sum_op.h        | 58 +++++++++++++++++++
 .../fluid/tests/unittests/test_reduce_op.py   | 46 ++++++++++++++-
 4 files changed, 112 insertions(+), 40 deletions(-)
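Note: this patch replaces the 2-D-only special case from patch 2 with a ReduceSumGradKernel that handles any single reduce axis by viewing the input as a [before x reduce x after] block and copying dy[i, k] into every dx[i, j, k]. A NumPy sketch of that decomposition (names are illustrative, not taken from the patch):

    import numpy as np

    def sum_grad_single_dim(x_shape, dy, dim):
        if dim < 0:
            dim += len(x_shape)  # normalize a negative dim, as the kernel does
        before = int(np.prod(x_shape[:dim]))
        rdim = x_shape[dim]
        after = int(np.prod(x_shape[dim + 1:]))
        dx = np.empty(before * rdim * after)
        dy_flat = dy.reshape(before * after)
        for i in range(before):
            for j in range(rdim):
                for k in range(after):
                    dx[i * rdim * after + j * after + k] = dy_flat[i * after + k]
        return dx.reshape(x_shape)

    x_shape = (5, 6, 7)
    dy = np.random.random((5, 7))  # gradient of reduce_sum(x, dim=1)
    dx = sum_grad_single_dim(x_shape, dy, 1)
    assert np.array_equal(dx, np.broadcast_to(dy[:, None, :], x_shape))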
diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
index 735ad3af2a..72b6cf1773 100644
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
@@ -88,35 +88,6 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto* output = context.Output<Tensor>(framework::GradVarName("X"));
     output->mutable_data<T>(context.GetPlace());
 
-    if (context.GetPlace().type() == typeid(platform::CPUPlace)) {
-      const auto* input2_d = input2->data<T>();
-      auto* output_d = output->data<T>();
-
-      // CPU reduce_all_grad
-      if (reduce_all) {
-        PADDLE_ENFORCE(input2->dims().size() == 1 && input2->dims()[0] == 1,
-                       "output should be a scalar");
-        for (int64_t i = 0; i < framework::product(input0->dims()); ++i) {
-          output_d[i] = input2_d[0];
-        }
-        return;
-      }
-
-      if (input0->dims().size() == 2 && dims.size() == 1) {
-        auto& input_dim = input0->dims();
-        for (int64_t i = 0; i < input_dim[0]; ++i) {
-          for (int64_t j = 0; j < input_dim[1]; ++j) {
-            if (dims[0] == 0) {
-              output_d[i * input_dim[1] + j] = input2_d[j];
-            } else {
-              output_d[i * input_dim[1] + j] = input2_d[i];
-            }
-          }
-        }
-        return;
-      }
-    }
-
     if (reduce_all) {
       auto x = EigenVector<T>::Flatten(*input0);
       auto x_reduce = EigenVector<T>::From(*input1);

diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc
index c5b5398787..f0e5f6580f 100644
--- a/paddle/fluid/operators/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_sum_op.cc
@@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL(
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                       ops::SumFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             float, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             double, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int64_t, ops::SumGradFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_sum_grad,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, float,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, double,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                             ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h
index 248782ce97..3e8d1bbdba 100644
--- a/paddle/fluid/operators/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_sum_op.h
@@ -14,11 +14,69 @@
 
 #pragma once
 
+#include <vector>
+
 #include "paddle/fluid/operators/reduce_op.h"
 
 namespace paddle {
 namespace operators {
 
+// use for loop to speed up Eigen broadcast: about 4 times faster than broadcast
+template <typename DeviceContext, typename T, typename Functor>
+class ReduceSumGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto dims = context.Attr<std::vector<int>>("dim");
+    if (context.GetPlace().type() == typeid(platform::CPUPlace) &&
+        dims.size() == 1) {
+      auto* input0 = context.Input<Tensor>("X");
+      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+      output->mutable_data<T>(context.GetPlace());
+      const auto* input2_d = input2->data<T>();
+      auto* output_d = output->data<T>();
+
+      // handle reduce_all
+      if (input2->dims().size() == 1 && input2->dims()[0] == 1) {
+        for (int64_t i = 0; i < framework::product(input0->dims()); ++i) {
+          output_d[i] = input2_d[0];
+        }
+        return;
+      }
+
+      // handle reduce by one dimension
+      int reduce_dim_index = dims[0];
+      if (reduce_dim_index < 0) {
+        reduce_dim_index += input0->dims().size();
+      }
+
+      auto& input_dim = input0->dims();
+      int64_t before_dim = 1;
+      for (int i = 0; i < reduce_dim_index; ++i) {
+        before_dim *= input_dim[i];
+      }
+      int64_t reduce_dim = input_dim[reduce_dim_index];
+      int64_t after_dim = 1;
+      for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) {
+        after_dim *= input_dim[i];
+      }
+      for (int64_t i = 0; i < before_dim; ++i) {
+        for (int64_t j = 0; j < reduce_dim; ++j) {
+          for (int64_t k = 0; k < after_dim; ++k) {
+            output_d[i * reduce_dim * after_dim + j * after_dim + k] =
+                input2_d[i * after_dim + k];
+          }
+        }
+      }
+      return;
+    }
+
+    // default use Eigen broadcast
+    ReduceGradKernel<DeviceContext, T, Functor> kernel;
+    kernel.Compute(context);
+  }
+};
+
 struct SumFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
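Note: the tests below exercise this kernel through OpTest.check_grad, which compares the operator's gradient against a numeric finite-difference gradient. For reduce_sum the analytic gradient is exactly dy broadcast along the reduced axis; a small self-contained check of that claim (illustration only, not Paddle's test harness):

    import numpy as np

    np.random.seed(0)
    x = np.random.random((3, 4))
    dy = np.random.random(3)  # upstream gradient of y = x.sum(axis=1)
    eps = 1e-6

    # Central-difference numeric gradient of sum(y * dy) w.r.t. each x element.
    num_grad = np.empty_like(x)
    flat = num_grad.reshape(-1)
    for i in range(x.size):
        xp = x.reshape(-1).copy()
        xp[i] += eps
        xm = x.reshape(-1).copy()
        xm[i] -= eps
        yp = (xp.reshape(x.shape).sum(axis=1) * dy).sum()
        ym = (xm.reshape(x.shape).sum(axis=1) * dy).sum()
        flat[i] = (yp - ym) / (2 * eps)

    # Analytic gradient: dy broadcast along the reduced axis.
    assert np.allclose(num_grad, np.broadcast_to(dy[:, None], x.shape), atol=1e-4)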
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index dbc2892646..06d116601b 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -115,14 +115,56 @@ class Test2DReduce1(Test1DReduce):
         self.op_type = "reduce_sum"
         self.attrs = {'dim': [1]}
         self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
-        self.outputs = {'Out': self.inputs['X'].sum(axis=1)}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce0(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce1(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce2(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [-2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce3(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1, 2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
 
 
 class TestKeepDimReduce(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [-2], 'keep_dim': True}
+        self.attrs = {'dim': [1], 'keep_dim': True}
         self.outputs = {
             'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
                                         keepdims=self.attrs['keep_dim'])
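Note: TestKeepDimReduce switches to a positive dim here because the new Test3DReduce2 now covers the negative-dim case; on a rank-3 input, dim=-2 and dim=1 address the same axis, and keep_dim retains that axis with size 1. A quick NumPy check (illustration only):

    import numpy as np

    x = np.random.random((5, 6, 10))
    assert np.array_equal(x.sum(axis=-2), x.sum(axis=1))  # -2 + rank(3) == 1
    assert x.sum(axis=1, keepdims=True).shape == (5, 1, 10)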