From bde90be71bc2758b464960c8e2631ee177c1d9a7 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 17 Jul 2017 18:10:18 +0800
Subject: [PATCH 1/3] Read/Write a Tensor Python

Basically following
http://pybind11.readthedocs.io/en/stable/advanced/pycpp/numpy.html

* Use buffer protocol to return a view of Tensor. It can be cast to
  numpy array in Python.
* Set a numpy array to a tensor.
---
 paddle/framework/tensor.h                    |   9 +-
 paddle/pybind/pybind.cc                      | 142 +++++++++++++++++-
 .../paddle/v2/framework/tests/test_tensor.py |  45 ++++++
 3 files changed, 194 insertions(+), 2 deletions(-)
 create mode 100644 python/paddle/v2/framework/tests/test_tensor.py

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 29bad7a00a..891cf73641 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include
 #include
 #include
+#include
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/enforce.h"
 #include "paddle/memory/memory.h"
@@ -127,6 +128,10 @@ class Tensor {

   DDim dims() const { return dims_; }

+  platform::Place place() const { return holder_->place(); }
+
+  std::type_index type() const { return holder_->type(); }
+
  private:
   // Placeholder hides type T, so it doesn't appear as a template
   // parameter of Variable.
@@ -135,6 +140,7 @@ class Tensor {
     virtual void* ptr() const = 0;
     virtual platform::Place place() const = 0;
     virtual size_t size() const = 0;
+    virtual std::type_index type() const = 0;
   };

   template <typename T, typename PlaceType>
@@ -159,7 +165,8 @@ class Tensor {
     virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual size_t size() const { return size_; }

-    virtual platform::Place place() const { return place_; }
+    virtual paddle::platform::Place place() const { return place_; }
+    virtual std::type_index type() const { return std::type_index(typeid(T)); }

     std::unique_ptr<T, Deleter<PlaceType>> ptr_;
     platform::Place place_;  // record the place of ptr_.

diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index b5ead21fd0..8222323e36 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -25,9 +26,143 @@ namespace pd = paddle::framework;

 USE_OP(add_two);

+struct PlaceDebugString : public boost::static_visitor<std::string> {
+  std::string operator()(const paddle::platform::GPUPlace& place) const {
+    return "GPU(" + std::to_string(place.device) + ")";
+  }
+
+  std::string operator()(const paddle::platform::CPUPlace& place) const {
+    return "CPU";
+  }
+};
+
+template <typename T>
+struct TensorToPyBuffer {
+  pd::Tensor& self_;
+  explicit TensorToPyBuffer(pd::Tensor& self) : self_(self) {}
+
+  bool CanCast() const { return std::type_index(typeid(T)) == self_.type(); }
+
+  py::buffer_info Cast() const {
+    auto dim_vec = pd::vectorize(self_.dims());
+    std::vector<size_t> dims_outside;
+    std::vector<size_t> strides;
+    dims_outside.resize(dim_vec.size());
+    strides.resize(dim_vec.size());
+
+    size_t prod = 1;
+    for (size_t i = dim_vec.size(); i != 0; --i) {
+      dims_outside[i - 1] = (size_t)dim_vec[i - 1];
+      strides[i - 1] = sizeof(float) * prod;
+      prod *= dims_outside[i - 1];
+    }
+
+    return py::buffer_info(self_.mutable_data<T>(self_.place()),
+                           sizeof(T),
+                           py::format_descriptor<T>::format(),
+                           (size_t)pd::arity(self_.dims()),
+                           dims_outside,
+                           strides);
+  }
+};
+
+template <bool less, size_t I, typename... ARGS>
+struct CastToPyBufferImpl;
+
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<false, I, ARGS...> {
+  py::buffer_info operator()(pd::Tensor& tensor) {
+    PADDLE_THROW("This type of tensor cannot be expose to Python");
+    return py::buffer_info();
+  }
+};
+
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<true, I, ARGS...> {
+  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
+  py::buffer_info operator()(pd::Tensor& tensor) {
+    TensorToPyBuffer<CUR_TYPE> cast_object(tensor);
+    if (cast_object.CanCast()) {
+      return cast_object.Cast();
+    } else {
+      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
+      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
+    }
+  }
+};
+
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& vec) {
+  for (size_t i = 0; i < vec.size(); ++i) {
+    os << vec[i];
+    if (i + 1 != vec.size()) {
+      os << ", ";
+    }
+  }
+  return os;
+}
+
+py::buffer_info CastToPyBuffer(pd::Tensor& tensor) {
+  auto buffer_info = CastToPyBufferImpl<true, 0, float, int>()(tensor);
+  return buffer_info;
+}
+
+template <typename T>
+void PyTensorSet(
+    pd::Tensor& self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
+  std::vector<int> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.set_dims(pd::make_ddim(dims));
+  auto* dst = self.mutable_data<T>(paddle::platform::CPUPlace());
+  std::memcpy(dst, array.data(), sizeof(T) * array.size());
+}
+
 PYBIND11_PLUGIN(core) {
   py::module m("core", "C++ core of Paddle Paddle");

+  py::class_<paddle::platform::Place>(
+      m, "Place", R"DOC(Device Place Class.)DOC")
+      .def("__str__",
+           [](const paddle::platform::Place& self) {
+             return boost::apply_visitor(PlaceDebugString(), self);
+           })
+      .def("is_gpu",
+           [](const paddle::platform::Place& self) {
+             return paddle::platform::is_gpu_place(self);
+           })
+      .def("is_cpu", [](const paddle::platform::Place& self) {
+        return paddle::platform::is_cpu_place(self);
+      });
+
+  py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
+      .def("get_place", &pd::Tensor::place)
+      .def_buffer([](pd::Tensor& self) -> py::buffer_info {
+        PADDLE_ENFORCE(paddle::platform::is_cpu_place(self.place()),
+                       "Only CPU tensor can cast to numpy array");
+        return CastToPyBuffer(self);
+      })
+      .def("get_dims",
+           [](const pd::Tensor& self) { return pd::vectorize(self.dims()); })
+      .def("set_dims",
+           [](pd::Tensor& self, const std::vector<int>& dim) {
+             self.set_dims(pd::make_ddim(dim));
+           })
+      .def("alloc_float",
+           [](pd::Tensor& self) {
+             self.mutable_data<float>(paddle::platform::CPUPlace());
+           })
+      .def("alloc_int",
+           [](pd::Tensor& self) {
+             self.mutable_data<int>(paddle::platform::CPUPlace());
+           })
+      .def("set", PyTensorSet<float>)
+      .def("set", PyTensorSet<int>);
+
   py::class_<pd::Variable>(m, "Variable", R"DOC(Variable Class.

 All parameter, weight, gradient are variables in Paddle.
@@ -38,7 +173,12 @@ All parameter, weight, gradient are variables in Paddle.
              *var.GetMutable<int>() = val;
            })
       .def("get_int",
-           [](const pd::Variable& var) -> int { return var.Get<int>(); });
+           [](const pd::Variable& var) -> int { return var.Get<int>(); })
+      .def("get_tensor",
+           [](pd::Variable& self) -> pd::Tensor* {
+             return self.GetMutable<pd::Tensor>();
+           },
+           py::return_value_policy::reference);

   py::class_<pd::Scope, std::shared_ptr<pd::Scope>>(m, "Scope")
       .def(py::init<const std::shared_ptr<pd::Scope>&>())

diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/framework/tests/test_tensor.py
new file mode 100644
index 0000000000..b72aff3b9c
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_tensor.py
@@ -0,0 +1,45 @@
+import paddle.v2.framework.core as core
+import unittest
+import numpy
+
+
+class TestScope(unittest.TestCase):
+    def test_int_tensor(self):
+        scope = core.Scope(None)
+        var = scope.create_var("test_tensor")
+        tensor = var.get_tensor()
+
+        tensor.set_dims([1000, 784])
+        tensor.alloc_int()
+
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((1000, 784), tensor_array.shape)
+        tensor_array[3, 9] = 1
+        tensor_array[19, 11] = 2
+        tensor.set(tensor_array)
+
+        tensor_array_2 = numpy.array(tensor)
+        self.assertEqual(1.0, tensor_array_2[3, 9])
+        self.assertEqual(2.0, tensor_array_2[19, 11])
+
+    def test_float_tensor(self):
+        scope = core.Scope(None)
+        var = scope.create_var("test_tensor")
+        tensor = var.get_tensor()
+
+        tensor.set_dims([1000, 784])
+        tensor.alloc_float()
+
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((1000, 784), tensor_array.shape)
+        tensor_array[3, 9] = 1.0
+        tensor_array[19, 11] = 2.0
+        tensor.set(tensor_array)
+
+        tensor_array_2 = numpy.array(tensor)
+        self.assertAlmostEqual(1.0, tensor_array_2[3, 9])
+        self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
+
+
+if __name__ == '__main__':
+    unittest.main()

From a89c7ffa94bc26a879b8978273219980648aaec4 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 18 Jul 2017 11:57:31 +0800
Subject: [PATCH 2/3] Make Tensor <--> Numpy interactive in tensor.h

* Follow review comments to seperate Tensor Numpy interactive methods in
  tensor.h.
* Simplify logic for `CastToPyBufferImpl`, make it as one struct and in
  details namespace.
* Remove `Scope` expose in Python, since it currently is useless.
* Remove some debug functions.
---
 paddle/pybind/pybind.cc                       | 118 +-----------------
 paddle/pybind/tensor.h                        |  91 ++++++++++++++
 .../paddle/v2/framework/tests/CMakeLists.txt  |   3 +-
 3 files changed, 97 insertions(+), 115 deletions(-)
 create mode 100644 paddle/pybind/tensor.h

diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 8222323e36..e3dc3e718c 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -26,125 +27,14 @@ namespace pd = paddle::framework;

 USE_OP(add_two);

-struct PlaceDebugString : public boost::static_visitor<std::string> {
-  std::string operator()(const paddle::platform::GPUPlace& place) const {
-    return "GPU(" + std::to_string(place.device) + ")";
-  }
-
-  std::string operator()(const paddle::platform::CPUPlace& place) const {
-    return "CPU";
-  }
-};
-
-template <typename T>
-struct TensorToPyBuffer {
-  pd::Tensor& self_;
-  explicit TensorToPyBuffer(pd::Tensor& self) : self_(self) {}
-
-  bool CanCast() const { return std::type_index(typeid(T)) == self_.type(); }
-
-  py::buffer_info Cast() const {
-    auto dim_vec = pd::vectorize(self_.dims());
-    std::vector<size_t> dims_outside;
-    std::vector<size_t> strides;
-    dims_outside.resize(dim_vec.size());
-    strides.resize(dim_vec.size());
-
-    size_t prod = 1;
-    for (size_t i = dim_vec.size(); i != 0; --i) {
-      dims_outside[i - 1] = (size_t)dim_vec[i - 1];
-      strides[i - 1] = sizeof(float) * prod;
-      prod *= dims_outside[i - 1];
-    }
-
-    return py::buffer_info(self_.mutable_data<T>(self_.place()),
-                           sizeof(T),
-                           py::format_descriptor<T>::format(),
-                           (size_t)pd::arity(self_.dims()),
-                           dims_outside,
-                           strides);
-  }
-};
-
-template <bool less, size_t I, typename... ARGS>
-struct CastToPyBufferImpl;
-
-template <size_t I, typename... ARGS>
-struct CastToPyBufferImpl<false, I, ARGS...> {
-  py::buffer_info operator()(pd::Tensor& tensor) {
-    PADDLE_THROW("This type of tensor cannot be expose to Python");
-    return py::buffer_info();
-  }
-};
-
-template <size_t I, typename... ARGS>
-struct CastToPyBufferImpl<true, I, ARGS...> {
-  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
-  py::buffer_info operator()(pd::Tensor& tensor) {
-    TensorToPyBuffer<CUR_TYPE> cast_object(tensor);
-    if (cast_object.CanCast()) {
-      return cast_object.Cast();
-    } else {
-      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
-      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
-    }
-  }
-};
-
-template <typename T>
-std::ostream& operator<<(std::ostream& os, const std::vector<T>& vec) {
-  for (size_t i = 0; i < vec.size(); ++i) {
-    os << vec[i];
-    if (i + 1 != vec.size()) {
-      os << ", ";
-    }
-  }
-  return os;
-}
-
-py::buffer_info CastToPyBuffer(pd::Tensor& tensor) {
-  auto buffer_info = CastToPyBufferImpl<true, 0, float, int>()(tensor);
-  return buffer_info;
-}
-
-template <typename T>
-void PyTensorSet(
-    pd::Tensor& self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
-  std::vector<int> dims;
-  dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
-    dims.push_back((int)array.shape()[i]);
-  }
-
-  self.set_dims(pd::make_ddim(dims));
-  auto* dst = self.mutable_data<T>(paddle::platform::CPUPlace());
-  std::memcpy(dst, array.data(), sizeof(T) * array.size());
-}
-
 PYBIND11_PLUGIN(core) {
   py::module m("core", "C++ core of Paddle Paddle");

-  py::class_<paddle::platform::Place>(
-      m, "Place", R"DOC(Device Place Class.)DOC")
-      .def("__str__",
-           [](const paddle::platform::Place& self) {
-             return boost::apply_visitor(PlaceDebugString(), self);
-           })
-      .def("is_gpu",
-           [](const paddle::platform::Place& self) {
-             return paddle::platform::is_gpu_place(self);
-           })
-      .def("is_cpu", [](const paddle::platform::Place& self) {
-        return paddle::platform::is_cpu_place(self);
-      });
-
   py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
-      .def("get_place", &pd::Tensor::place)
       .def_buffer([](pd::Tensor& self) -> py::buffer_info {
         PADDLE_ENFORCE(paddle::platform::is_cpu_place(self.place()),
                        "Only CPU tensor can cast to numpy array");
-        return CastToPyBuffer(self);
+        return paddle::pybind::CastToPyBuffer(self);
       })
       .def("get_dims",
            [](const pd::Tensor& self) { return pd::vectorize(self.dims()); })
@@ -160,8 +50,8 @@ PYBIND11_PLUGIN(core) {
            [](pd::Tensor& self) {
             self.mutable_data<int>(paddle::platform::CPUPlace());
           })
.def("set", PyTensorSet) - .def("set", PyTensorSet); + .def("set", paddle::pybind::PyTensorSetFromArray) + .def("set", paddle::pybind::PyTensorSetFromArray); py::class_(m, "Variable", R"DOC(Variable Class. diff --git a/paddle/pybind/tensor.h b/paddle/pybind/tensor.h new file mode 100644 index 0000000000..ef07144ad4 --- /dev/null +++ b/paddle/pybind/tensor.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include + +namespace py = pybind11; + +namespace paddle { + +namespace pybind { + +namespace details { + +template +struct CastToPyBufferImpl; + +template +struct CastToPyBufferImpl { + py::buffer_info operator()(framework::Tensor &tensor) { + PADDLE_THROW("This type of tensor cannot be expose to Python"); + return py::buffer_info(); + } +}; + +template +struct CastToPyBufferImpl { + using CUR_TYPE = typename std::tuple_element>::type; + py::buffer_info operator()(framework::Tensor &tensor) { + if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) { + auto dim_vec = framework::vectorize(tensor.dims()); + std::vector dims_outside; + std::vector strides; + dims_outside.resize(dim_vec.size()); + strides.resize(dim_vec.size()); + + size_t prod = 1; + for (size_t i = dim_vec.size(); i != 0; --i) { + dims_outside[i - 1] = (size_t)dim_vec[i - 1]; + strides[i - 1] = sizeof(CUR_TYPE) * prod; + prod *= dims_outside[i - 1]; + } + + return py::buffer_info(tensor.mutable_data(tensor.place()), + sizeof(CUR_TYPE), + py::format_descriptor::format(), + (size_t)framework::arity(tensor.dims()), + dims_outside, + strides); + } else { + constexpr bool less = I + 1 < std::tuple_size>::value; + return CastToPyBufferImpl()(tensor); + } + } +}; +} // namespace details +inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { + auto buffer_info = details::CastToPyBufferImpl()(tensor); + return buffer_info; +} + +template +void PyTensorSetFromArray( + framework::Tensor &self, + py::array_t array) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back((int)array.shape()[i]); + } + + self.set_dims(framework::make_ddim(dims)); + auto *dst = self.mutable_data(paddle::platform::CPUPlace()); + std::memcpy(dst, array.data(), sizeof(T) * array.size()); +} + +} // namespace pybind +} // namespace paddle diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 86fc60f26a..4ce2bef6fc 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -1,2 +1,3 @@ add_python_test(test_framework test_protobuf.py test_scope.py - test_default_scope_funcs.py test_op_creation_methods.py) + test_default_scope_funcs.py test_op_creation_methods.py + test_tensor.py) From 1dc53a289fe724cd3772618de374aacbf72a87f6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 18 Jul 2017 15:23:13 +0800 Subject: [PATCH 3/3] Use friend not to expose tensor's `type/place` 
---
 paddle/framework/tensor.h                 | 14 +++++++++-----
 paddle/pybind/pybind.cc                   |  4 +---
 paddle/pybind/{tensor.h => tensor_bind.h} | 18 +++++++++++-------
 3 files changed, 21 insertions(+), 15 deletions(-)
 rename paddle/pybind/{tensor.h => tensor_bind.h} (84%)

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 891cf73641..c495687dc4 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -24,6 +24,12 @@ limitations under the License. */
 #include "paddle/platform/place.h"

 namespace paddle {
+namespace pybind {
+namespace details {  // forward declare
+template <bool less, size_t I, typename... ARGS>
+struct CastToPyBufferImpl;
+}  // namespace details
+}  // namespace pybind
 namespace framework {

 class Tensor {
@@ -128,10 +134,6 @@ class Tensor {

   DDim dims() const { return dims_; }

-  platform::Place place() const { return holder_->place(); }
-
-  std::type_index type() const { return holder_->type(); }
-
  private:
   // Placeholder hides type T, so it doesn't appear as a template
   // parameter of Variable.
@@ -186,7 +188,9 @@ class Tensor {
   DDim dims_;
   size_t numel_;   // cache of `product(dims_)`
   size_t offset_;  // marks the begin of tensor data area.
-};  // namespace framework
+  template <bool less, size_t I, typename... ARGS>
+  friend struct paddle::pybind::details::CastToPyBufferImpl;
+};  // namespace framework

 }  // namespace framework
 }  // namespace paddle

diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index e3dc3e718c..0eef36f8ec 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -32,8 +32,6 @@ PYBIND11_PLUGIN(core) {

   py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer([](pd::Tensor& self) -> py::buffer_info {
-        PADDLE_ENFORCE(paddle::platform::is_cpu_place(self.place()),
-                       "Only CPU tensor can cast to numpy array");
         return paddle::pybind::CastToPyBuffer(self);
       })
       .def("get_dims",

diff --git a/paddle/pybind/tensor.h b/paddle/pybind/tensor_bind.h
similarity index 84%
rename from paddle/pybind/tensor.h
rename to paddle/pybind/tensor_bind.h
index ef07144ad4..b96516643a 100644
--- a/paddle/pybind/tensor.h
+++ b/paddle/pybind/tensor_bind.h
@@ -40,7 +40,10 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
   using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
   py::buffer_info operator()(framework::Tensor &tensor) {
-    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
+                   "Only CPU tensor can cast to numpy array");
+
+    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
       auto dim_vec = framework::vectorize(tensor.dims());
       std::vector<size_t> dims_outside;
       std::vector<size_t> strides;
@@ -54,12 +57,13 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         prod *= dims_outside[i - 1];
       }

-      return py::buffer_info(tensor.mutable_data<CUR_TYPE>(tensor.place()),
-                             sizeof(CUR_TYPE),
-                             py::format_descriptor<CUR_TYPE>::format(),
-                             (size_t)framework::arity(tensor.dims()),
-                             dims_outside,
-                             strides);
+      return py::buffer_info(
+          tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
+          sizeof(CUR_TYPE),
+          py::format_descriptor<CUR_TYPE>::format(),
+          (size_t)framework::arity(tensor.dims()),
+          dims_outside,
+          strides);
     } else {
       constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
       return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
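
A minimal sketch of the Python-side flow this series enables, condensing the test_tensor.py added in PATCH 1/3. It only uses names that appear in the diffs above (core.Scope, create_var, get_tensor, set_dims, alloc_float, set, and conversion through the buffer protocol); the variable name and the [2, 3] shape are illustrative, not part of the patches.

    import numpy
    import paddle.v2.framework.core as core

    scope = core.Scope(None)
    var = scope.create_var("example_tensor")   # illustrative variable name
    tensor = var.get_tensor()

    # Allocate float storage on the CPU, then copy a numpy array into the
    # tensor via Tensor.set (PyTensorSetFromArray after PATCH 2/3).
    tensor.set_dims([2, 3])
    tensor.alloc_float()
    tensor.set(numpy.arange(6, dtype="float32").reshape(2, 3))

    # The buffer protocol (def_buffer / CastToPyBuffer) lets numpy read the
    # tensor's CPU memory back out.
    array = numpy.array(tensor)
    assert array.shape == (2, 3)
    assert array[1, 2] == 5.0

Note that numpy.array(tensor) copies the data out; writing back into the tensor always goes through Tensor.set, which is exactly what the two tests in test_tensor.py exercise.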