From e2e82bde32709a0bedaf940c60c3d5e3b73d22b1 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 11 Oct 2018 21:12:56 +0800
Subject: [PATCH 1/5] Accelerate Reshape op

---
 paddle/fluid/operators/reshape_op.cc         | 82 ++++++++++++--------
 paddle/fluid/operators/sequence_concat_op.cc |  5 +-
 2 files changed, 51 insertions(+), 36 deletions(-)
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index d72f85f2c4..b8fdc3f826 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of
 [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
 
 3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
-Attr(shape) still should be set correctly to gurantee shape inference in 
+Attr(shape) still should be set correctly to gurantee shape inference in
 compile-time.
 
 )DOC");
@@ -195,6 +195,7 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
   }
 };
 
+template <typename T>
 class ReshapeKernel {
  public:
   void operator()(const framework::ExecutionContext &ctx) const {
@@ -227,12 +228,15 @@ class ReshapeKernel {
           "sequence_reshape op.");
     }
 
-    out->mutable_data(ctx.GetPlace(), in->type());
-    framework::TensorCopySync(*in, ctx.GetPlace(), out);
+    if (in->data<T>() !=
+        reinterpret_cast<T *>(out->mutable_data(ctx.GetPlace(), in->type()))) {
+      framework::TensorCopySync(*in, ctx.GetPlace(), out);
+    }
     out->Resize(out_dims);
   }
 };
 
+template <typename T>
 class ReshapeGradKernel {
  public:
   void operator()(const framework::ExecutionContext &ctx) const {
@@ -240,8 +244,9 @@ class ReshapeGradKernel {
     auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     auto in_dims = d_x->dims();
 
-    d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
+    if (d_out->data<T>() != d_x->mutable_data(ctx.GetPlace(), d_out->type())) {
+      framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
+    }
     d_x->Resize(in_dims);
   }
 };
@@ -259,7 +264,6 @@ class Reshape2Op : public ReshapeOp {
       : ReshapeOp(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    ReshapeOp::InferShape(ctx);
     PADDLE_ENFORCE(ctx->HasOutput("XShape"),
                    "Output(XShape) of ReshapeOp should not be null.");
     const auto &x_dims = ctx->GetInputDim("X");
@@ -270,6 +274,8 @@ class Reshape2Op : public ReshapeOp {
     }
     ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
     ctx->ShareLoD("X", /*->*/ "XShape");
+
+    ReshapeOp::InferShape(ctx);
   }
 };
 
@@ -335,38 +341,46 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
-                               ops::ReshapeKernel, int, ops::ReshapeKernel,
-                               int64_t, ops::ReshapeKernel);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
-                               double, ops::ReshapeGradKernel, int,
-                               ops::ReshapeGradKernel, int64_t,
-                               ops::ReshapeGradKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel<float>,
+                               double, ops::ReshapeKernel<double>, int,
+                               ops::ReshapeKernel<int>, int64_t,
+                               ops::ReshapeKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float,
+                               ops::ReshapeGradKernel<float>, double,
+                               ops::ReshapeGradKernel<double>, int,
+                               ops::ReshapeGradKernel<int>, int64_t,
+                               ops::ReshapeGradKernel<int64_t>);
 
 REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
                   ops::Reshape2GradMaker);
 REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                               ops::ReshapeKernel, int, ops::ReshapeKernel,
-                               int64_t, ops::ReshapeKernel);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
-                               double, ops::ReshapeGradKernel, int,
-                               ops::ReshapeGradKernel, int64_t,
-                               ops::ReshapeGradKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel<float>,
+                               double, ops::ReshapeKernel<double>, int,
+                               ops::ReshapeKernel<int>, int64_t,
+                               ops::ReshapeKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float,
+                               ops::ReshapeGradKernel<float>, double,
+                               ops::ReshapeGradKernel<double>, int,
+                               ops::ReshapeGradKernel<int>, int64_t,
+                               ops::ReshapeGradKernel<int64_t>);
 
 #ifdef PADDLE_WITH_CUDA
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
-                                ops::ReshapeKernel, int, ops::ReshapeKernel,
-                                int64_t, ops::ReshapeKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
-                                double, ops::ReshapeGradKernel, int,
-                                ops::ReshapeGradKernel, int64_t,
-                                ops::ReshapeGradKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                                ops::ReshapeKernel, int, ops::ReshapeKernel,
-                                int64_t, ops::ReshapeKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
-                                double, ops::ReshapeGradKernel, int,
-                                ops::ReshapeGradKernel, int64_t,
-                                ops::ReshapeGradKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel<float>,
+                                double, ops::ReshapeKernel<double>, int,
+                                ops::ReshapeKernel<int>, int64_t,
+                                ops::ReshapeKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float,
+                                ops::ReshapeGradKernel<float>, double,
+                                ops::ReshapeGradKernel<double>, int,
+                                ops::ReshapeGradKernel<int>, int64_t,
+                                ops::ReshapeGradKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel<float>,
+                                double, ops::ReshapeKernel<double>, int,
+                                ops::ReshapeKernel<int>, int64_t,
+                                ops::ReshapeKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float,
+                                ops::ReshapeGradKernel<float>, double,
+                                ops::ReshapeGradKernel<double>, int,
+                                ops::ReshapeGradKernel<int>, int64_t,
+                                ops::ReshapeGradKernel<int64_t>);
 #endif
diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc
index 397a318295..12b53be708 100644
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
@@ -90,11 +90,12 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel,
                   paddle::framework::DefaultGradOpDescMaker<false>);
 template <typename T>
 using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>;
-REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>);
+REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
+                       Kernel<int64_t>);
 REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel,
                   op::SeqConcatGradShapeInferer);
 template <typename T>
 using GradKernel =
     op::SeqConcatGradKernel<paddle::platform::CPUDeviceContext, T>;
 REGISTER_OP_CPU_KERNEL(sequence_concat_grad, GradKernel<float>,
-                       GradKernel<double>);
+                       GradKernel<double>, GradKernel<int64_t>);

From f40848828df2bdb5d80675802e4d71bf4f817c3e Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 11 Oct 2018 22:39:04 +0800
Subject: [PATCH 2/5] Polish code

test=develop
---
 paddle/fluid/operators/sequence_concat_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc
index 12b53be708..3234b60861 100644
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
@@ -92,6 +92,7 @@ template <typename T>
 using Kernel = op::SeqConcatKernel<paddle::platform::CPUDeviceContext, T>;
 REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel<float>, Kernel<double>,
                        Kernel<int64_t>);
+
 REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel,
                   op::SeqConcatGradShapeInferer);
 template <typename T>

From d9b202e7172ce649945fd7042029cd6a742e1aa3 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Mon, 15 Oct 2018 15:25:09 +0800
Subject: [PATCH 3/5] Move tensor copy src_ptr and dst_ptr check to
 TensorCopySync function

test=develop
---
 paddle/fluid/framework/tensor_util.cc | 11 ++++
 paddle/fluid/operators/reshape_op.cc  | 77 +++++++++++----------------
 2 files changed, 43 insertions(+), 45 deletions(-)

diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1d7a2eb5b3..de77d189c8 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -114,6 +114,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   auto dst_ptr = dst->mutable_data(dst_place, src.type());
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data from " << src.place() << " to "
+              << dst_place;
+      return;
+    }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
@@ -132,6 +137,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
              platform::is_gpu_place(dst_place)) {
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    if (src_ptr == dst_ptr &&
+        src_gpu_place.GetDeviceId() == dst_gpu_place.GetDeviceId()) {
+      VLOG(3) << "Skip copy the same data from " << src.place() << " to "
+              << dst_place;
+      return;
+    }
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   }
 #endif
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index b8fdc3f826..500d86fec3 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -195,7 +195,6 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename T>
 class ReshapeKernel {
  public:
   void operator()(const framework::ExecutionContext &ctx) const {
@@ -228,15 +227,12 @@ class ReshapeKernel {
           "sequence_reshape op.");
     }
 
-    if (in->data<T>() !=
-        reinterpret_cast<T *>(out->mutable_data(ctx.GetPlace(), in->type()))) {
-      framework::TensorCopySync(*in, ctx.GetPlace(), out);
-    }
+    out->mutable_data(ctx.GetPlace(), in->type());
+    framework::TensorCopySync(*in, ctx.GetPlace(), out);
     out->Resize(out_dims);
   }
 };
 
-template <typename T>
 class ReshapeGradKernel {
  public:
   void operator()(const framework::ExecutionContext &ctx) const {
@@ -244,9 +240,8 @@ class ReshapeGradKernel {
     auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     auto in_dims = d_x->dims();
 
-    if (d_out->data<T>() != d_x->mutable_data(ctx.GetPlace(), d_out->type())) {
-      framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
-    }
+    d_x->mutable_data(ctx.GetPlace(), d_out->type());
+    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
     d_x->Resize(in_dims);
   }
 };
@@ -341,46 +336,38 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel<float>,
-                               double, ops::ReshapeKernel<double>, int,
-                               ops::ReshapeKernel<int>, int64_t,
-                               ops::ReshapeKernel<int64_t>);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float,
-                               ops::ReshapeGradKernel<float>, double,
-                               ops::ReshapeGradKernel<double>, int,
-                               ops::ReshapeGradKernel<int>, int64_t,
-                               ops::ReshapeGradKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
+                               ops::ReshapeKernel, int, ops::ReshapeKernel,
+                               int64_t, ops::ReshapeKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
+                               double, ops::ReshapeGradKernel, int,
+                               ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel);
 
 REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
                   ops::Reshape2GradMaker);
 REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel<float>,
-                               double, ops::ReshapeKernel<double>, int,
-                               ops::ReshapeKernel<int>, int64_t,
-                               ops::ReshapeKernel<int64_t>);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float,
-                               ops::ReshapeGradKernel<float>, double,
-                               ops::ReshapeGradKernel<double>, int,
-                               ops::ReshapeGradKernel<int>, int64_t,
-                               ops::ReshapeGradKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                               ops::ReshapeKernel, int, ops::ReshapeKernel,
+                               int64_t, ops::ReshapeKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                               double, ops::ReshapeGradKernel, int,
+                               ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel);
 
 #ifdef PADDLE_WITH_CUDA
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel<float>,
-                                double, ops::ReshapeKernel<double>, int,
-                                ops::ReshapeKernel<int>, int64_t,
-                                ops::ReshapeKernel<int64_t>);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float,
-                                ops::ReshapeGradKernel<float>, double,
-                                ops::ReshapeGradKernel<double>, int,
-                                ops::ReshapeGradKernel<int>, int64_t,
-                                ops::ReshapeGradKernel<int64_t>);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel<float>,
-                                double, ops::ReshapeKernel<double>, int,
-                                ops::ReshapeKernel<int>, int64_t,
-                                ops::ReshapeKernel<int64_t>);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float,
-                                ops::ReshapeGradKernel<float>, double,
-                                ops::ReshapeGradKernel<double>, int,
-                                ops::ReshapeGradKernel<int>, int64_t,
-                                ops::ReshapeGradKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
+                                ops::ReshapeKernel, int, ops::ReshapeKernel,
+                                int64_t, ops::ReshapeKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
+                                double, ops::ReshapeGradKernel, int,
+                                ops::ReshapeGradKernel, int64_t,
+                                ops::ReshapeGradKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                                ops::ReshapeKernel, int, ops::ReshapeKernel,
+                                int64_t, ops::ReshapeKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                                double, ops::ReshapeGradKernel, int,
+                                ops::ReshapeGradKernel, int64_t,
+                                ops::ReshapeGradKernel);
 #endif

From 24c9fbdba36b4b9804c63f7ddefeb1074714e63b Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Mon, 15 Oct 2018 16:13:29 +0800
Subject: [PATCH 4/5] Skip same-pointer copies in TensorCopy and polish TensorCopySync

test=develop
---
 paddle/fluid/framework/tensor_util.cc | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index de77d189c8..69bcbc0e58 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
   auto size = src.numel() * SizeOfType(src.type());
 
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
@@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     if (platform::is_same_place(src_place, dst_place)) {
+      if (src_ptr == dst_ptr) {
+        VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+                << dst_place;
+        return;
+      }
       memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                    stream);
     } else {
@@ -115,7 +125,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data from " << src.place() << " to "
+      VLOG(3) << "Skip copy the same data from " << src_place << " to "
               << dst_place;
       return;
     }
@@ -135,14 +145,13 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    if (src_ptr == dst_ptr &&
-        src_gpu_place.GetDeviceId() == dst_gpu_place.GetDeviceId()) {
-      VLOG(3) << "Skip copy the same data from " << src.place() << " to "
+    if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
+      VLOG(3) << "Skip copy the same data from " << src_place << " to "
               << dst_place;
       return;
     }
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   }
 #endif

From aeec82acd5c37d110a71832d647f3c27834c7c8a Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Mon, 15 Oct 2018 17:21:10 +0800
Subject: [PATCH 5/5] Add unittest for TensorCopy on the same tensor

test=develop
---
 paddle/fluid/framework/tensor_util_test.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
index a1e5b967a8..793ccfc79f 100644
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) {
     EXPECT_EQ(src_ptr[i], dst_ptr[i]);
   }
 
+  TensorCopy(dst_tensor, *cpu_place, &dst_tensor);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+
   EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
 
   Tensor slice_tensor = src_tensor.Slice(1, 2);
@@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) {
       EXPECT_EQ(src_ptr[i], dst_ptr[i]);
     }
 
+    // Copy the same tensor
+    TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+    gpu_ctx.Wait();
+    const int* dst_ptr_tmp = dst_tensor.data<int>();
+    EXPECT_NE(src_ptr, dst_ptr_tmp);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]);
+    }
+
     Tensor slice_tensor = src_tensor.Slice(1, 2);
 
     // CPU Slice Tensor to GPU Tensor