Remove template parameter for Tensor methods (#4937)

* Remove template parameter for Tensor methods

* Also check that the stored type is correct in data()
* Simplify holder_

* Fix accuracy_op

* Register Code
Branch: revert-4814-Add_sequence_project_op
Author: Yu Yang (committed by GitHub, 8 years ago)
parent 43702a89d5
commit c532b96741
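Taken together, the change moves the element type from every call site into the tensor's allocation: mutable_data<T>() records a std::type_index in the holder, data<T>() verifies it, and Slice/CopyFrom/ShareDataWith recover the element size from it instead of from a template argument. A minimal standalone sketch of that idea (MiniTensor and every name below are illustrative, not Paddle code):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <typeindex>
#include <vector>

// Toy type-erased buffer: the element type is remembered as a std::type_index
// instead of being repeated as a template parameter of every accessor, which
// is the pattern this commit introduces for Tensor.
class MiniTensor {
 public:
  template <typename T>
  T* mutable_data(std::size_t n) {
    numel_ = n;
    type_ = typeid(T);                 // record the element type once
    buf_.assign(n * sizeof(T), 0);
    return reinterpret_cast<T*>(buf_.data());
  }

  template <typename T>
  const T* data() const {
    // The check added by this commit: typed access must match the stored type.
    assert(std::type_index(typeid(T)) == type_ && "Tensor holds the wrong type");
    return reinterpret_cast<const T*>(buf_.data());
  }

  // No template parameter: the byte count follows from the stored type.
  void CopyFrom(const MiniTensor& src) {
    numel_ = src.numel_;
    type_ = src.type_;
    buf_ = src.buf_;                   // byte-wise copy is type-agnostic
  }

  std::type_index type() const { return type_; }
  std::size_t numel() const { return numel_; }

 private:
  std::vector<uint8_t> buf_;
  std::type_index type_{typeid(void)};
  std::size_t numel_{0};
};

int main() {
  MiniTensor src, dst;
  float* p = src.mutable_data<float>(4);
  p[0] = 1.5f;
  dst.CopyFrom(src);                   // no CopyFrom<float>(...) needed
  assert(dst.data<float>()[0] == 1.5f);
  return 0;
}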

@ -26,6 +26,8 @@ inline DataType ToDataType(std::type_index type) {
return DataType::FP64;
} else if (typeid(int).hash_code() == type.hash_code()) {
return DataType::INT32;
} else if (typeid(int64_t).hash_code() == type.hash_code()) {
return DataType::INT64;
} else {
PADDLE_THROW("Not supported");
}
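The only functional addition here is the int64_t branch. As a reminder of the pattern, a condensed standalone stand-in (only the enum members visible in this hunk; the real DataType enum has more):

#include <cstdint>
#include <stdexcept>
#include <typeindex>

enum class DataType { FP64, INT32, INT64 };

// Condensed stand-in for the mapping above: compare the runtime
// std::type_index against typeid(...) and translate it to an enum value.
inline DataType ToDataType(std::type_index type) {
  if (type == typeid(double)) return DataType::FP64;
  if (type == typeid(int)) return DataType::INT32;
  if (type == typeid(int64_t)) return DataType::INT64;
  throw std::runtime_error("Not supported");
}

int main() { return ToDataType(typeid(int64_t)) == DataType::INT64 ? 0 : 1; }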

@ -34,7 +34,7 @@ void SetFeedVariable(const LoDTensor& input, const std::string& var_name,
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index].ShareDataWith<T>(input);
feed_inputs[index].ShareDataWith(input);
// set lod
feed_inputs[index].set_lod(input.lod());
}

@ -60,6 +60,10 @@ class Tensor {
template <typename T>
inline T* mutable_data(platform::Place place);
inline void* mutable_data(platform::Place place, std::type_index type);
inline void* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
*
@ -81,7 +85,6 @@ class Tensor {
inline Tensor& Resize(const DDim& dims);
/*! The internal of two tensors share the same memory block. */
template <typename T>
inline Tensor& ShareDataWith(const Tensor& src);
/**
@ -96,26 +99,9 @@ class Tensor {
// TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
// Remove `CopyFrom` and `CopyFromVector` from Tensor interface
// and make them global functions
template <typename T>
inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx);
// FIXME(yuyang18): CopyFrom should without template T, use the replace
// `CopyFrom` with `CopyFromTensor`
inline void CopyFromTensor(const Tensor& src,
const platform::Place& dst_place,
const platform::DeviceContext& ctx) {
// NOLINTNEXTLINES_8 cpplint.py will recognize below lines as functions.
// That is a bug of cpplint.py. Just ignore lint these lines.
if (src.type() == std::type_index(typeid(double))) {
CopyFrom<double>(src, dst_place, ctx);
} else if (src.type() == std::type_index(typeid(float))) {
CopyFrom<float>(src, dst_place, ctx);
} else if (src.type() == std::type_index(typeid(int))) {
CopyFrom<int>(src, dst_place, ctx);
}
}
/**
* @brief Copy the content of an external vector to a tensor.
*
@ -135,7 +121,6 @@ class Tensor {
* @param[in] begin_idx The begin index of the slice.
* @param[in] end_idx The end index of the slice.
*/
template <typename T>
inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
platform::Place place() const {
@ -146,7 +131,6 @@ class Tensor {
std::type_index type() const { return holder_->type(); }
private:
template <typename T>
inline void check_memory_size() const;
private:
@ -155,20 +139,22 @@ class Tensor {
* parameter of Variable.
*/
struct Placeholder {
virtual ~Placeholder() {}
virtual ~Placeholder() = default;
virtual void* ptr() const = 0;
virtual size_t size() const = 0;
virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0;
};
template <typename T, typename Place>
template <typename Place>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size)
: ptr_(static_cast<T*>(memory::Alloc(place, size)),
memory::PODDeleter<T, Place>(place)),
PlaceholderImpl(Place place, size_t size, std::type_index type)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
memory::PODDeleter<uint8_t, Place>(place)),
place_(place),
size_(size) {
size_(size),
type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
@ -176,16 +162,20 @@ class Tensor {
virtual size_t size() const { return size_; }
virtual platform::Place place() const { return place_; }
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return std::type_index(typeid(T)); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
/*! the pointer of memory block. */
std::unique_ptr<T, memory::PODDeleter<T, Place>> ptr_;
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
/*! the place of memory block. */
platform::Place place_;
/*! the size of memory block. */
size_t size_;
/* the current type of memory */
std::type_index type_;
};
/*! holds the memory block if allocated. */
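With the type recorded in the holder, PlaceholderImpl no longer needs an element-type parameter: one instantiation per Place owns raw uint8_t storage plus a std::type_index. A minimal sketch of that holder pattern, with std::malloc/std::free and FreeDeleter standing in for memory::Alloc and PODDeleter:

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <typeindex>

struct FreeDeleter {
  void operator()(uint8_t* p) const { std::free(p); }
};

// Type-erased holder interface, mirroring the shape of Placeholder above.
struct Placeholder {
  virtual ~Placeholder() = default;
  virtual void* ptr() const = 0;
  virtual size_t size() const = 0;
  virtual std::type_index type() const = 0;
};

struct BytesPlaceholder : Placeholder {
  BytesPlaceholder(size_t size, std::type_index type)
      : ptr_(static_cast<uint8_t*>(std::malloc(size))),
        size_(size),
        type_(type) {}

  void* ptr() const override { return ptr_.get(); }
  size_t size() const override { return size_; }
  std::type_index type() const override { return type_; }

  std::unique_ptr<uint8_t, FreeDeleter> ptr_;  // raw bytes, no element type
  size_t size_;
  std::type_index type_;  // the element type travels alongside the bytes
};

int main() {
  std::unique_ptr<Placeholder> holder(
      new BytesPlaceholder(16 * sizeof(float), typeid(float)));
  std::cout << holder->type().name() << ", " << holder->size() << " bytes\n";
  return 0;
}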

@ -106,8 +106,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) {
values_[index].Resize(value.dims());
values_[index].mutable_data<value_type>(platform::CPUPlace());
values_[index].CopyFrom<value_type>(value, platform::CPUPlace(),
platform::CPUDeviceContext());
values_[index].CopyFrom(value, platform::CPUPlace(),
platform::CPUDeviceContext());
}
void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
@ -116,7 +116,7 @@ void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
values_.resize(index + 1);
}
values_[index].ShareDataWith<value_type>(value);
values_[index].ShareDataWith(value);
}
LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
@ -163,9 +163,9 @@ LoDTensor TensorArray::Stack() const {
result.mutable_data<value_type>(platform::CPUPlace());
for (size_t idx = 0; idx < size(); idx++) {
result.Slice<value_type>(idx, idx + 1)
.CopyFrom<value_type>(Read(idx), platform::CPUPlace(),
platform::CPUDeviceContext());
result.Slice(idx, idx + 1)
.CopyFrom(Read(idx), platform::CPUPlace(),
platform::CPUDeviceContext());
}
return result;
}
@ -191,13 +191,12 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
auto& value = values_[elem];
if (data_shared) {
// share memory
value.ShareDataWith<value_type>(source.Slice<value_type>(elem, elem + 1));
value.ShareDataWith(source.Slice(elem, elem + 1));
} else {
// copy
value.Resize(value_dims);
value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
platform::CPUPlace(),
platform::CPUDeviceContext());
value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(),
platform::CPUDeviceContext());
}
}
}
@ -242,11 +241,10 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
for (size_t i = 0; i < indice.size(); i++) {
auto index = indice[i];
auto target = result.Slice<value_type>(i, i + 1);
auto slice = source->Slice<value_type>(index, index + 1);
auto target = result.Slice(i, i + 1);
auto slice = source->Slice(index, index + 1);
target.CopyFrom<value_type>(slice, platform::CPUPlace(),
platform::CPUDeviceContext());
target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext());
}
return result;
@ -277,10 +275,10 @@ LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
// target is result[index]
auto index = seq_meta.begin + batch_id;
if (index >= seq_meta.end) break;
auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
auto target = result.Slice<float>(index, index + 1);
target.CopyFrom<float>(source_, platform::CPUPlace(),
platform::CPUDeviceContext());
auto source_ = source[batch_id].Slice(seq_id, seq_id + 1);
auto target = result.Slice(index, index + 1);
target.CopyFrom(source_, platform::CPUPlace(),
platform::CPUDeviceContext());
}
}

@ -91,7 +91,7 @@ class TensorArrayPackTester : public ::testing::Test {
size_t begin = level[i];
size_t end = level[i + 1];
for (size_t j = begin; j < end; j++) {
auto record = source.Slice<int>(j, j + 1);
auto record = source.Slice(j, j + 1);
for (int dim = 0; dim < 128; dim++) {
record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
}

@ -19,12 +19,50 @@ limitations under the License. */
namespace paddle {
namespace framework {
template <typename... T>
struct SizeOfTypeFunctor;
template <typename T>
struct SizeOfTypeFunctor<T> {
size_t operator()(std::type_index type) const {
if (typeid(T).hash_code() == type.hash_code()) {
return sizeof(T);
} else {
return 0UL;
}
}
};
template <>
struct SizeOfTypeFunctor<> {
size_t operator()(std::type_index type) const { return 0UL; }
};
template <typename HEAD, typename... TAIL>
struct SizeOfTypeFunctor<HEAD, TAIL...> {
size_t operator()(std::type_index type) const {
SizeOfTypeFunctor<HEAD> head;
size_t head_size = head(type);
if (head_size != 0) {
return head_size;
}
SizeOfTypeFunctor<TAIL...> tail;
return tail(type);
}
};
static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor;
size_t size = functor(type);
PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
return size;
}
inline void Tensor::check_memory_size() const {
PADDLE_ENFORCE_NOT_NULL(
holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
PADDLE_ENFORCE_GE(
holder_->size(), numel() * sizeof(T) + offset_,
holder_->size(), numel() * SizeOfType(type()) + offset_,
"Tensor's dims_ is out of bound. Call Tensor::mutable_data "
"first to re-allocate memory.\n"
"or maybe the required data-type mismatches the data already stored.");
@ -32,14 +70,23 @@ inline void Tensor::check_memory_size() const {
template <typename T>
inline const T* Tensor::data() const {
check_memory_size<T>();
check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code(),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
template <typename T>
inline T* Tensor::data() {
check_memory_size<T>();
check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code(),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
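Note the std::is_same<T, void> escape hatch: data<void>() skips the type check, which is what the non-templated CopyFrom below relies on to obtain a raw source pointer. A reduced illustration (access_allowed and held are illustrative names, not Paddle APIs):

#include <cassert>
#include <type_traits>
#include <typeindex>

// Typed access must match the recorded element type, but T = void is always
// allowed so untyped code paths can still get a raw pointer to the bytes.
template <typename T>
bool access_allowed(std::type_index held) {
  return std::is_same<T, void>::value ||
         held.hash_code() == typeid(T).hash_code();
}

int main() {
  std::type_index held = typeid(float);  // stands in for holder_->type()
  assert(access_allowed<float>(held));   // matching type: OK
  assert(!access_allowed<double>(held)); // mismatched type: rejected
  assert(access_allowed<void>(held));    // void: untyped escape hatch
  return 0;
}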
@ -54,51 +101,62 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
template <typename T>
inline T* Tensor::mutable_data(platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
}
inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_ENFORCE_GT(numel(), 0,
"Tensor's numel must be larger than zero to call "
"Tensor::mutable_data. Call Tensor::set_dim first.");
int64_t size = numel() * SizeOfType(type);
/* some versions of boost::variant don't have operator!= */
int64_t size = numel() * sizeof(T);
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size));
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
}
#else
holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
boost::get<platform::GPUPlace>(place), size));
holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
boost::get<platform::GPUPlace>(place), size, type));
}
#endif
offset_ = 0;
}
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
inline void* Tensor::mutable_data(platform::Place place) {
PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing");
return mutable_data(place, holder_->type());
}
template <typename T>
inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
src.check_memory_size<T>();
src.check_memory_size();
*this = src;
return *this;
}
template <typename T>
inline void Tensor::CopyFrom(const Tensor& src,
const platform::Place& dst_place,
const platform::DeviceContext& ctx) {
src.check_memory_size<T>();
src.check_memory_size();
Resize(src.dims());
auto src_place = src.holder_->place();
auto src_ptr = static_cast<const void*>(src.data<T>());
auto src_ptr = src.data<void>();
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
auto dst_ptr = mutable_data(dst_place, src.type());
auto size = src.numel() * sizeof(T);
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@ -165,9 +223,8 @@ inline void Tensor::CopyFromVector(const std::vector<T>& src,
#endif
}
template <typename T>
inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
check_memory_size<T>();
check_memory_size();
PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero.");
PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound.");
PADDLE_ENFORCE_LT(begin_idx, end_idx,
@ -182,7 +239,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx;
dst.Resize(dst_dims);
dst.offset_ = offset_ + begin_idx * base * sizeof(T);
dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
return dst;
}
}
@ -196,10 +253,9 @@ inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return product(dims_); }
template <typename T>
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;
res.ShareDataWith<T>(src);
res.ShareDataWith(src);
res.Resize(flatten_to_2d(src.dims(), num_col_dims));
return res;
}
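ReshapeToMatrix now shares data untyped and only reshapes; the 2-D shape comes from flatten_to_2d, which, as far as I can tell, folds the first num_col_dims axes into rows and the remaining axes into columns. A hedged sketch of that folding, checked against the ReshapeToMatrix test later in this diff:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

// Assumption about flatten_to_2d's semantics, not its source: fold the first
// num_col_dims dimensions into the row count and the rest into the column
// count. The test below expects {2, 3, 4, 9} with num_col_dims == 2 to become
// a (2*3) x (4*9) matrix, which this reproduces.
std::pair<int64_t, int64_t> flatten_to_2d(const std::vector<int64_t>& dims,
                                          int num_col_dims) {
  int64_t rows = std::accumulate(dims.begin(), dims.begin() + num_col_dims,
                                 int64_t{1}, std::multiplies<int64_t>());
  int64_t cols = std::accumulate(dims.begin() + num_col_dims, dims.end(),
                                 int64_t{1}, std::multiplies<int64_t>());
  return {rows, cols};
}

int main() {
  auto shape = flatten_to_2d({2, 3, 4, 9}, 2);
  assert(shape.first == 2 * 3 && shape.second == 4 * 9);
  return 0;
}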

@ -108,7 +108,7 @@ TEST(Tensor, ShareDataWith) {
// Try to share data form uninitialized tensor
bool caught = false;
try {
dst_tensor.ShareDataWith<float>(src_tensor);
dst_tensor.ShareDataWith(src_tensor);
} catch (paddle::platform::EnforceNotMet err) {
caught = true;
std::string msg =
@ -122,7 +122,7 @@ TEST(Tensor, ShareDataWith) {
ASSERT_TRUE(caught);
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
dst_tensor.ShareDataWith<int>(src_tensor);
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
@ -131,7 +131,7 @@ TEST(Tensor, ShareDataWith) {
Tensor src_tensor;
Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
dst_tensor.ShareDataWith<int>(src_tensor);
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
@ -143,7 +143,7 @@ TEST(Tensor, Slice) {
{
Tensor src_tensor;
src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
Tensor slice_tensor = src_tensor.Slice<int>(1, 3);
Tensor slice_tensor = src_tensor.Slice(1, 3);
DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 3);
EXPECT_EQ(slice_dims[0], 2);
@ -167,7 +167,7 @@ TEST(Tensor, Slice) {
{
Tensor src_tensor;
src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
Tensor slice_tensor = src_tensor.Slice(2, 6);
DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4);
@ -202,7 +202,7 @@ TEST(Tensor, CopyFrom) {
memcpy(src_ptr, arr, 9 * sizeof(int));
auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(src_tensor, *cpu_place, cpu_ctx);
dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx);
const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr);
@ -210,8 +210,8 @@ TEST(Tensor, CopyFrom) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place, cpu_ctx);
Tensor slice_tensor = src_tensor.Slice(1, 2);
dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx);
const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr);
@ -233,11 +233,11 @@ TEST(Tensor, CopyFrom) {
// CPU Tensor to GPU Tensor
auto gpu_place = new paddle::platform::GPUPlace(0);
CUDADeviceContext gpu_ctx(*gpu_place);
gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place, gpu_ctx);
gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx);
// GPU Tensor to CPU Tensor
auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors
gpu_ctx.Wait();
@ -247,13 +247,13 @@ TEST(Tensor, CopyFrom) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor
gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place, gpu_ctx);
gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx);
// GPU Tensor to CPU Tensor
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Slice Tensors
gpu_ctx.Wait();
@ -320,7 +320,7 @@ TEST(Tensor, CopyFromVector) {
CUDADeviceContext gpu_ctx(*gpu_place);
gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
// Copy from GPU to CPU tensor for comparison
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors
gpu_ctx.Wait();
@ -340,7 +340,7 @@ TEST(Tensor, CopyFromVector) {
cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
gpu_tensor.Resize(make_ddim({2, 2}));
gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors
gpu_ctx.Wait();
@ -368,7 +368,7 @@ TEST(Tensor, ReshapeToMatrix) {
for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
src_ptr[i] = i;
}
Tensor res = ReshapeToMatrix<int>(src, 2);
Tensor res = ReshapeToMatrix(src, 2);
ASSERT_EQ(res.dims()[0], 2 * 3);
ASSERT_EQ(res.dims()[1], 4 * 9);
}

@ -69,5 +69,8 @@ information, or not. But the output only shares the LoD with input `Inference`.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker);
REGISTER_OP_CPU_KERNEL(accuracy,
ops::AccuracyKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
accuracy, ops::AccuracyKernel<paddle::platform::CPUPlace, float>,
ops::AccuracyKernel<paddle::platform::CPUPlace, int>,
ops::AccuracyKernel<paddle::platform::CPUPlace, double>,
ops::AccuracyKernel<paddle::platform::CPUPlace, int64_t>);
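Registering one AccuracyKernel instantiation per element type lets the framework dispatch on the tensor's recorded type at run time. A generic sketch of such a type-keyed dispatch table (a toy registry, not the Paddle REGISTER_OP_CPU_KERNEL machinery):

#include <cassert>
#include <cstdint>
#include <functional>
#include <typeindex>
#include <unordered_map>

// Toy kernel registry: one entry per element type, keyed by std::type_index,
// mirroring the idea of listing several AccuracyKernel<CPUPlace, T> above.
using Kernel = std::function<void(const void* data, int n)>;

std::unordered_map<std::type_index, Kernel>& registry() {
  static std::unordered_map<std::type_index, Kernel> r;
  return r;
}

template <typename T>
void register_kernel(Kernel k) {
  registry()[typeid(T)] = std::move(k);
}

void dispatch(std::type_index type, const void* data, int n) {
  auto it = registry().find(type);
  assert(it != registry().end() && "no kernel registered for this type");
  it->second(data, n);
}

int main() {
  register_kernel<float>([](const void*, int) { /* float accuracy kernel */ });
  register_kernel<int64_t>([](const void*, int) { /* int64 accuracy kernel */ });
  float xs[3] = {0.0f, 1.0f, 2.0f};
  dispatch(typeid(float), xs, 3);  // routed to the float kernel
  return 0;
}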

@ -21,9 +21,9 @@ namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
template <int BlockSize>
__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
const int* labeldata, float* accuracy) {
template <typename T, int BlockSize>
__global__ void AccuracyCudaKernel(const int N, const int D, const T* Xdata,
const T* labeldata, float* accuracy) {
int count = 0;
__shared__ int total[BlockSize];
@ -57,8 +57,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
auto* accuracy = ctx.Output<Tensor>("Accuracy");
// FIXME(typhoonzero): only support indices currently
// if add support for output values, how to detect the data type?
const int* inference_data = inference->data<int>();
const int* label_data = label->data<int>();
const T* inference_data = inference->data<T>();
const T* label_data = label->data<T>();
float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
size_t num_samples = inference->dims()[0];
@ -69,7 +69,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
return;
}
AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
AccuracyCudaKernel<T, PADDLE_CUDA_NUM_THREADS><<<
1, PADDLE_CUDA_NUM_THREADS, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
@ -81,5 +81,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_GPU_KERNEL(accuracy,
paddle::operators::AccuracyOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>,
paddle::operators::AccuracyOpCUDAKernel<int>,
paddle::operators::AccuracyOpCUDAKernel<int64_t>);

@ -108,17 +108,17 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
int in_step = input_channels / groups;
int out_step = output_channels / groups;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice<T>(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
// im2col
Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0], strides[1],
paddings[0], paddings[1]);
// gemm
Tensor out_slice = out_batch.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Place, T>(context.device_context(), filter_slice, false,
col_matrix, false, T(1.0), &out_slice, T(0.0));
}
@ -198,22 +198,20 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
Tensor in_grad_batch =
input_grad->Slice<T>(i, i + 1).Resize(input_shape);
output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
Tensor out_grad_slice =
out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor filter_slice =
filter.Slice<T>(g * out_step, (g + 1) * out_step);
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Place, T>(context.device_context(), filter_slice, true,
out_grad_slice, false, T(1.0), &col_matrix,
T(0.0));
// col2im
Tensor in_grad_slice =
in_grad_batch.Slice<T>(g * in_step, (g + 1) * in_step);
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
col2im(context.device_context(), in_grad_slice, col, strides[0],
strides[1], paddings[0], paddings[1]);
}
@ -229,19 +227,19 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// im2col
Tensor out_grad_slice =
out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[1]);
// gemm
Tensor filter_grad_slice =
filter_grad_.Slice<T>(g * out_step, (g + 1) * out_step);
filter_grad_.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Place, T>(context.device_context(), out_grad_slice,
false, col_matrix, true, T(1.0),
&filter_grad_slice, T(1.0));

@ -48,12 +48,11 @@ inline void ReorderBootState(const DySeqMetaBatch& metas,
const LoDTensor& boot_state, LoDTensor* tensor,
const platform::Place& dst_place) {
for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
auto slice = tensor->Slice<T>(seq_id, seq_id + 1);
auto slice = tensor->Slice(seq_id, seq_id + 1);
auto boot_slice =
boot_state.Slice<T>(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
// TODO(superjom) pass in device context as an argument
slice.template CopyFrom<T>(boot_slice, dst_place,
platform::CPUDeviceContext());
slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext());
}
}
@ -138,7 +137,7 @@ void DynamicRecurrentOp::WriteStepInputs() const {
if (var == nullptr) {
var = step_scope.Var(item.first);
}
var->GetMutable<LoDTensor>()->ShareDataWith<value_type>(tensor);
var->GetMutable<LoDTensor>()->ShareDataWith(tensor);
}
}
}
@ -206,7 +205,7 @@ void DynamicRecurrentOp::ConcatOutputs() const {
for (auto& item : step_outputs_) {
auto tensor = item.second.Pack(level, some_meta, some_lod);
auto* output = cache_.outlinks[item.first]->GetMutable<LoDTensor>();
const_cast<LoDTensor*>(output)->ShareDataWith<value_type>(tensor);
const_cast<LoDTensor*>(output)->ShareDataWith(tensor);
}
}
@ -260,8 +259,8 @@ void DynamicRecurrentOp::LinkState(const rnn::MemoryAttr& memory,
}
// shink and share from previous state
auto shrinked_pre_state = pre_state->Slice<value_type>(0, num_instances);
state_pre.ShareDataWith<value_type>(shrinked_pre_state);
auto shrinked_pre_state = pre_state->Slice(0, num_instances);
state_pre.ShareDataWith(shrinked_pre_state);
}
void DynamicRecurrentOp::ArgCache::Init(

@ -47,7 +47,7 @@ class FeedOp : public framework::OperatorBase {
auto &feed_list = feed_var->Get<framework::FeedFetchList>();
auto &feed_item = feed_list.at(static_cast<size_t>(col));
auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
out_item->CopyFromTensor(feed_item, dev_ctx.GetPlace(), dev_ctx);
out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx);
out_item->set_lod(feed_item.lod());
}
};

@ -51,7 +51,7 @@ class FetchOp : public framework::OperatorBase {
// FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs?
dst_item.CopyFromTensor(src_item, platform::CPUPlace(), dev_ctx);
dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx);
VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
}

@ -64,7 +64,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom<float>(input_tmp, *place, *context);
input.CopyFrom(input_tmp, *place, *context);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
@ -85,8 +85,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace(),
*context);
output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context);
out_cfo_ptr = output_tmp.data<float>();
}
EXPECT_EQ(out_cfo_ptr[0], 0);
@ -102,8 +101,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace(),
*context);
output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context);
out_ocf_ptr = output_tmp.data<float>();
}
EXPECT_EQ(out_ocf_ptr[0], 0);

@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) {
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
input2_gpu.CopyFrom<float>(input1, *gpu_place, context);
input1_gpu.CopyFrom(input1, *gpu_place, context);
input2_gpu.CopyFrom(input1, *gpu_place, context);
out_gpu.mutable_data<float>({2, 2}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
out.CopyFrom<float>(out_gpu, *cpu_place, context);
out.CopyFrom(out_gpu, *cpu_place, context);
float* out_ptr = out.data<float>();
context.Wait();
@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) {
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
input2_gpu.CopyFrom<float>(input1, *gpu_place, context);
input1_gpu.CopyFrom(input1, *gpu_place, context);
input2_gpu.CopyFrom(input1, *gpu_place, context);
out_gpu.mutable_data<float>({3, 3}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
out.CopyFrom<float>(out_gpu, *cpu_place, context);
out.CopyFrom(out_gpu, *cpu_place, context);
float* out_ptr = out.data<float>();
context.Wait();
@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) {
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
input2_gpu.CopyFrom<float>(input2, *gpu_place, context);
input3_gpu.CopyFrom<float>(input3, *gpu_place, context);
input1_gpu.CopyFrom(input1, *gpu_place, context);
input2_gpu.CopyFrom(input2, *gpu_place, context);
input3_gpu.CopyFrom(input3, *gpu_place, context);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place);
@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) {
paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
input3.CopyFrom<float>(input3_gpu, *cpu_place, context);
input3.CopyFrom(input3_gpu, *cpu_place, context);
// numpy code:
// a = np.arange(6).reshape(2, 3)
@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) {
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
input2_gpu.CopyFrom<float>(input2, *gpu_place, context);
input3_gpu.CopyFrom<float>(input3, *gpu_place, context);
input1_gpu.CopyFrom(input1, *gpu_place, context);
input2_gpu.CopyFrom(input2, *gpu_place, context);
input3_gpu.CopyFrom(input3, *gpu_place, context);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place);
@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) {
paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
input3.CopyFrom<float>(input3_gpu, *cpu_place, context);
input3.CopyFrom(input3_gpu, *cpu_place, context);
context.Wait();
EXPECT_EQ(input3_ptr[0], 0);

@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) {
EXPECT_EQ(out_rows[6], 9);
Tensor out_cpu;
out_cpu.CopyFrom<float>(*out_value, cpu_place, ctx);
out_cpu.CopyFrom(*out_value, cpu_place, ctx);
ctx.Wait();
auto* out_cpu_data = out_cpu.data<float>();
@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) {
add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
Tensor tensor2_cpu;
tensor2_cpu.CopyFrom<float>(*tensor2, cpu_place, ctx);
tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx);
ctx.Wait();
auto* tensor2_cpu_data = tensor2_cpu.data<float>();

@ -78,7 +78,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom<float>(input_tmp, *place, *context);
input.CopyFrom(input_tmp, *place, *context);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
@ -93,7 +93,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
output_tmp.CopyFrom<float>(output, paddle::platform::CPUPlace(), *context);
output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context);
out_cfo_ptr = output_tmp.data<float>();
}
@ -107,7 +107,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom<float>(input_tmp, *place, *context);
input.CopyFrom(input_tmp, *place, *context);
}
paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
@ -118,7 +118,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
input_tmp.CopyFrom<float>(input, paddle::platform::CPUPlace(), *context);
input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
in_ptr = input_tmp.data<float>();
}

@ -46,7 +46,7 @@ class MatMulKernel : public framework::OpKernel<T> {
template <typename T>
inline Tensor Reshape(const Tensor& input, const DDim& dims) {
Tensor output;
output.ShareDataWith<T>(input);
output.ShareDataWith(input);
output.Resize(dims);
return output;
}
@ -56,7 +56,7 @@ inline Tensor Reshape(const Tensor& input, const DDim& dims) {
template <typename T>
Tensor CombineBatchAndM(const Tensor& input) {
Tensor output;
output.ShareDataWith<T>(input);
output.ShareDataWith(input);
auto in_dims = input.dims();
if (in_dims.size() == 3) {
std::vector<int64_t> out_dims = {in_dims[0] * in_dims[1], in_dims[2]};
@ -80,7 +80,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
output.Resize(make_ddim(out_dims));
} else {
output.ShareDataWith<T>(input);
output.ShareDataWith(input);
}
return output;
}

@ -36,12 +36,12 @@ class MulKernel : public framework::OpKernel<T> {
Tensor* z = context.Output<Tensor>("Out");
const Tensor x_matrix =
x->dims().size() > 2
? framework::ReshapeToMatrix<T>(
? framework::ReshapeToMatrix(
*x, context.template Attr<int>("x_num_col_dims"))
: *x;
const Tensor y_matrix =
y->dims().size() > 2
? framework::ReshapeToMatrix<T>(
? framework::ReshapeToMatrix(
*y, context.template Attr<int>("y_num_col_dims"))
: *y;
@ -59,30 +59,30 @@ class MulGradKernel : public framework::OpKernel<T> {
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
const Tensor* x = ctx.Input<Tensor>("X");
const Tensor* y = ctx.Input<Tensor>("Y");
const Tensor x_matrix =
x->dims().size() > 2 ? framework::ReshapeToMatrix<T>(*x, x_num_col_dims)
: *x;
const Tensor y_matrix =
y->dims().size() > 2 ? framework::ReshapeToMatrix<T>(*y, y_num_col_dims)
: *y;
const Tensor x_matrix = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: *x;
const Tensor y_matrix = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: *y;
const Tensor* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
*dx, x_num_col_dims)
: *dx;
Tensor dx_matrix = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
math::matmul<Place, T>(ctx.device_context(), *dout, false, y_matrix, true,
1, &dx_matrix, 0);
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
*dy, y_num_col_dims)
: *dy;
Tensor dy_matrix = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
math::matmul<Place, T>(ctx.device_context(), x_matrix, true, *dout, false,
1, &dy_matrix, 0);

@ -33,8 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
auto cols = ins[0]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace(),
ctx.device_context());
index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
auto* index = index_t_cpu.data<int32_t>();
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
@ -71,8 +70,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
auto cols = ins[0]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace(),
ctx.device_context());
index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
auto* index = index_t_cpu.data<int32_t>();
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(

@ -95,7 +95,7 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
step_scope->FindVar(attr.boot_var)->GetMutable<LoDTensor>();
pre_mem->Resize(boot_mem->dims());
PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
pre_mem->ShareDataWith<float>(*boot_mem);
pre_mem->ShareDataWith(*boot_mem);
}
}
@ -171,7 +171,7 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
auto* boot_mem_grad =
step_scope->Var(attr.boot_var)->GetMutable<LoDTensor>();
boot_mem_grad->Resize(mem_grad->dims());
boot_mem_grad->ShareDataWith<float>(*mem_grad);
boot_mem_grad->ShareDataWith(*mem_grad);
}
}

@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
std::transform(shape.begin(), shape.end(), shape_int64.begin(),
[](int a) { return static_cast<int64_t>(a); });
auto out_dims = framework::make_ddim(shape_int64);
out->CopyFrom<T>(*in, ctx.GetPlace(), ctx.device_context());
out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context());
out->Resize(out_dims);
}
};
@ -47,7 +47,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
d_x->mutable_data<T>(ctx.GetPlace());
auto in_dims = d_x->dims();
d_x->CopyFrom<T>(*d_out, ctx.GetPlace(), ctx.device_context());
d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context());
d_x->Resize(in_dims);
}
};

@ -43,7 +43,7 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
step_scopes[j]->Var(inlinks[i])->GetMutable<Tensor>();
// The input of operators of each step is Tensor here.
// Maybe need to modify Slice function.
*step_input = input->Slice<float>(j, j + 1);
*step_input = input->Slice(j, j + 1);
step_input->Resize(step_dims);
}
}
@ -71,8 +71,8 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
// TODO(luotao02) data type and platform::DeviceContext() should set
// correctly
(output->Slice<float>(j, j + 1))
.CopyFrom<float>(*step_output, platform::CPUPlace(), ctx);
(output->Slice(j, j + 1))
.CopyFrom(*step_output, platform::CPUPlace(), ctx);
}
}
}
@ -95,7 +95,7 @@ void LinkMemories(const std::vector<Scope*>& scopes,
auto* mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
mem->Resize(linked_mem->dims());
mem->ShareDataWith<float>(*linked_mem);
mem->ShareDataWith(*linked_mem);
}
}

@ -30,7 +30,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
auto *Updates = ctx.Input<Tensor>("Updates");
auto *Out = ctx.Output<Tensor>("Out");
Out->ShareDataWith<T>(*Ref);
Out->ShareDataWith(*Ref);
GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
}
@ -48,7 +48,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
// In place gradient: dRef = dO
dRef->ShareDataWith<T>(*dOut);
dRef->ShareDataWith(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates = dO[Index]
GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);

@ -35,7 +35,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
auto *Out = ctx.Output<Tensor>("Out");
// In place output: Out = Ref, Out[Index] += Updates
Out->ShareDataWith<T>(*Ref);
Out->ShareDataWith(*Ref);
// Apply ScatterUpdate: Out[index] += Updates[:]
ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
}
@ -53,7 +53,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
// In place gradient: dRef = dO
dRef->ShareDataWith<T>(*dOut);
dRef->ShareDataWith(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates += dO[Index]
CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);

Some files were not shown because too many files have changed in this diff.