From e2e82bde32709a0bedaf940c60c3d5e3b73d22b1 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 11 Oct 2018 21:12:56 +0800 Subject: [PATCH 1/5] Accelerate Reshape op --- paddle/fluid/operators/reshape_op.cc | 82 ++++++++++++-------- paddle/fluid/operators/sequence_concat_op.cc | 5 +- 2 files changed, 51 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index d72f85f2c4..b8fdc3f826 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. 3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while -Attr(shape) still should be set correctly to gurantee shape inference in +Attr(shape) still should be set correctly to gurantee shape inference in compile-time. )DOC"); @@ -195,6 +195,7 @@ class ReshapeGradOp : public framework::OperatorWithKernel { } }; +template class ReshapeKernel { public: void operator()(const framework::ExecutionContext &ctx) const { @@ -227,12 +228,15 @@ class ReshapeKernel { "sequence_reshape op."); } - out->mutable_data(ctx.GetPlace(), in->type()); - framework::TensorCopySync(*in, ctx.GetPlace(), out); + if (in->data() != + reinterpret_cast(out->mutable_data(ctx.GetPlace(), in->type()))) { + framework::TensorCopySync(*in, ctx.GetPlace(), out); + } out->Resize(out_dims); } }; +template class ReshapeGradKernel { public: void operator()(const framework::ExecutionContext &ctx) const { @@ -240,8 +244,9 @@ class ReshapeGradKernel { auto *d_x = ctx.Output(framework::GradVarName("X")); auto in_dims = d_x->dims(); - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + if (d_out->data() != d_x->mutable_data(ctx.GetPlace(), d_out->type())) { + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + } d_x->Resize(in_dims); } }; @@ -259,7 +264,6 @@ class Reshape2Op : public ReshapeOp { : ReshapeOp(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - ReshapeOp::InferShape(ctx); PADDLE_ENFORCE(ctx->HasOutput("XShape"), "Output(XShape) of ReshapeOp should not be null."); const auto &x_dims = ctx->GetInputDim("X"); @@ -270,6 +274,8 @@ class Reshape2Op : public ReshapeOp { } ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->ShareLoD("X", /*->*/ "XShape"); + + ReshapeOp::InferShape(ctx); } }; @@ -335,38 +341,46 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, - double, ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, + double, ops::ReshapeKernel, int, + ops::ReshapeKernel, int64_t, + ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, + ops::ReshapeGradKernel, double, + ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, ops::Reshape2GradMaker); REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, - double, ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, + double, ops::ReshapeKernel, int, + ops::ReshapeKernel, int64_t, + ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, + ops::ReshapeGradKernel, double, + ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); #ifdef PADDLE_WITH_CUDA -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, - double, ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, - ops::ReshapeKernel, int, ops::ReshapeKernel, - int64_t, ops::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, - double, ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, + double, ops::ReshapeKernel, int, + ops::ReshapeKernel, int64_t, + ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, + ops::ReshapeGradKernel, double, + ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, + double, ops::ReshapeKernel, int, + ops::ReshapeKernel, int64_t, + ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, + ops::ReshapeGradKernel, double, + ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc index 397a318295..12b53be708 100644 --- a/paddle/fluid/operators/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -90,11 +90,12 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, paddle::framework::DefaultGradOpDescMaker); template using Kernel = op::SeqConcatKernel; -REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel); +REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, + Kernel); REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, op::SeqConcatGradShapeInferer); template using GradKernel = op::SeqConcatGradKernel; REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel, - GradKernel); + GradKernel, GradKernel); From f40848828df2bdb5d80675802e4d71bf4f817c3e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 11 Oct 2018 22:39:04 +0800 Subject: [PATCH 2/5] Polish code test=develop --- paddle/fluid/operators/sequence_concat_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc index 12b53be708..3234b60861 100644 --- a/paddle/fluid/operators/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_concat_op.cc @@ -92,6 +92,7 @@ template using Kernel = op::SeqConcatKernel; REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, Kernel); + REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, op::SeqConcatGradShapeInferer); template From d9b202e7172ce649945fd7042029cd6a742e1aa3 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 15 Oct 2018 15:25:09 +0800 Subject: [PATCH 3/5] Move tensor copy src_ptr and dst_ptr check to TensorCopy function test=develop --- paddle/fluid/framework/tensor_util.cc | 11 ++++ paddle/fluid/operators/reshape_op.cc | 77 +++++++++++---------------- 2 files changed, 43 insertions(+), 45 deletions(-) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1d7a2eb5b3..de77d189c8 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -114,6 +114,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto dst_ptr = dst->mutable_data(dst_place, src.type()); auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data from " << src.place() << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -132,6 +137,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, platform::is_gpu_place(dst_place)) { auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); + if (src_ptr == dst_ptr && + src_gpu_place.GetDeviceId() == dst_gpu_place.GetDeviceId()) { + VLOG(3) << "Skip copy the same data from " << src.place() << " to " + << dst_place; + return; + } memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } #endif diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index b8fdc3f826..500d86fec3 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -195,7 +195,6 @@ class ReshapeGradOp : public framework::OperatorWithKernel { } }; -template class ReshapeKernel { public: void operator()(const framework::ExecutionContext &ctx) const { @@ -228,15 +227,12 @@ class ReshapeKernel { "sequence_reshape op."); } - if (in->data() != - reinterpret_cast(out->mutable_data(ctx.GetPlace(), in->type()))) { - framework::TensorCopySync(*in, ctx.GetPlace(), out); - } + out->mutable_data(ctx.GetPlace(), in->type()); + framework::TensorCopySync(*in, ctx.GetPlace(), out); out->Resize(out_dims); } }; -template class ReshapeGradKernel { public: void operator()(const framework::ExecutionContext &ctx) const { @@ -244,9 +240,8 @@ class ReshapeGradKernel { auto *d_x = ctx.Output(framework::GradVarName("X")); auto in_dims = d_x->dims(); - if (d_out->data() != d_x->mutable_data(ctx.GetPlace(), d_out->type())) { - framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); - } + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); d_x->Resize(in_dims); } }; @@ -341,46 +336,38 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, - double, ops::ReshapeKernel, int, - ops::ReshapeKernel, int64_t, - ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, - ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, ops::Reshape2GradMaker); REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, - double, ops::ReshapeKernel, int, - ops::ReshapeKernel, int64_t, - ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, - ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); #ifdef PADDLE_WITH_CUDA -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, - double, ops::ReshapeKernel, int, - ops::ReshapeKernel, int64_t, - ops::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, - ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, - double, ops::ReshapeKernel, int, - ops::ReshapeKernel, int64_t, - ops::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, - ops::ReshapeGradKernel, double, - ops::ReshapeGradKernel, int, - ops::ReshapeGradKernel, int64_t, - ops::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); #endif From 24c9fbdba36b4b9804c63f7ddefeb1074714e63b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 15 Oct 2018 16:13:29 +0800 Subject: [PATCH 4/5] Polish code test=develop --- paddle/fluid/framework/tensor_util.cc | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index de77d189c8..69bcbc0e58 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } @@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { @@ -115,7 +125,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { if (src_ptr == dst_ptr) { - VLOG(3) << "Skip copy the same data from " << src.place() << " to " + VLOG(3) << "Skip copy the same data from " << src_place << " to " << dst_place; return; } @@ -135,14 +145,13 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); - if (src_ptr == dst_ptr && - src_gpu_place.GetDeviceId() == dst_gpu_place.GetDeviceId()) { - VLOG(3) << "Skip copy the same data from " << src.place() << " to " + if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " << dst_place; return; } + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } #endif From aeec82acd5c37d110a71832d647f3c27834c7c8a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 15 Oct 2018 17:21:10 +0800 Subject: [PATCH 5/5] Add unittest for reshape op test=develop --- paddle/fluid/framework/tensor_util_test.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index a1e5b967a8..793ccfc79f 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + TensorCopy(dst_tensor, *cpu_place, &dst_tensor); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); Tensor slice_tensor = src_tensor.Slice(1, 2); @@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + // Copy the same tensor + TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + gpu_ctx.Wait(); + const int* dst_ptr_tmp = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr_tmp); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); + } + Tensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor