Merge pull request #4455 from reyoung/feature/make_paddle_support_double

Support double precision
tonyyang-svail-feed-op-desgin
Yu Yang (committed via GitHub)
commit 184768e070

@ -0,0 +1,36 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <typeindex>
#include "paddle/framework/framework.pb.h"
namespace paddle {
namespace framework {
inline DataType ToDataType(std::type_index type) {
if (typeid(float).hash_code() == type.hash_code()) {
return DataType::FP32;
} else if (typeid(double).hash_code() == type.hash_code()) {
return DataType::FP64;
} else if (typeid(int).hash_code() == type.hash_code()) {
return DataType::INT32;
} else {
PADDLE_THROW("Not supported");
return static_cast<DataType>(-1);
}
}
} // namespace framework
} // namespace paddle
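
For orientation, a hedged usage sketch of the new helper (everything outside this diff is illustrative): ToDataType turns the runtime std::type_index of a tensor's element type into the protobuf DataType that later becomes half of the kernel key.

// Illustrative only; assumes it is compiled inside the PaddlePaddle tree of this PR.
#include <typeindex>
#include <typeinfo>
#include "paddle/framework/data_type.h"

void DataTypeExample() {
  using paddle::framework::DataType;
  using paddle::framework::ToDataType;
  DataType dt = ToDataType(std::type_index(typeid(double)));
  // dt == DataType::FP64, the value this PR adds support for.
  (void)dt;
}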

@ -100,13 +100,39 @@ class OpRegistrar : public Registrar {
}
};
template <typename PlaceType, typename KernelType>
template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
struct OpKernelRegistrarFunctor;
template <typename PlaceType, size_t I, typename... KernelTypes>
struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
using KERNEL_TYPE =
typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
void operator()(const char* op_type) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE;
OperatorWithKernel::OpKernelKey key(ToDataType(std::type_index(typeid(T))),
PlaceType());
OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
func;
func(op_type);
}
};
template <typename PlaceType, size_t I, typename... KernelType>
struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
void operator()(const char* op_type) const {}
};
// Users can register multiple kernels in one place. The data types can differ.
template <typename PlaceType, typename... KernelType>
class OpKernelRegistrar : public Registrar {
public:
explicit OpKernelRegistrar(const char* op_type) {
OperatorWithKernel::OpKernelKey key;
key.place_ = PlaceType();
OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KernelType);
OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
func(op_type);
}
};
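
The registrar now walks its kernel-type pack at compile time: the false specialization registers the kernel at index I and recurses, and the true specialization stops once I reaches the pack size. A self-contained sketch of the same unrolling pattern, with the registration step replaced by a print so it runs on its own (all names here are illustrative):

#include <cstddef>
#include <cstdio>
#include <tuple>
#include <typeinfo>

// Stand-in for OpKernelRegistrarFunctor: visit every type in the pack.
template <bool at_end, std::size_t I, typename... Types>
struct ForEachType;

template <std::size_t I, typename... Types>
struct ForEachType<false, I, Types...> {
  void operator()(const char* op_type) const {
    using T = typename std::tuple_element<I, std::tuple<Types...>>::type;
    std::printf("%s: register kernel for element type %s\n", op_type,
                typeid(T).name());
    constexpr std::size_t size = sizeof...(Types);
    ForEachType<I + 1 == size, I + 1, Types...> next;
    next(op_type);
  }
};

template <std::size_t I, typename... Types>
struct ForEachType<true, I, Types...> {
  void operator()(const char*) const {}  // recursion terminator
};

int main() {
  ForEachType<false, 0, float, double> visit;
  visit("elementwise_mul");  // prints one line per kernel element type
  return 0;
}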

@ -22,6 +22,7 @@ limitations under the License. */
#include "op_info.h"
#include "paddle/framework/attribute.h"
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/scope.h"
@ -403,7 +404,7 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
const Scope& scope_;
};
class OpKernel {
class OpKernelBase {
public:
/**
* ExecutionContext is the only parameter of Kernel Run function.
@ -414,33 +415,47 @@ class OpKernel {
virtual void Compute(const ExecutionContext& context) const = 0;
virtual ~OpKernel() {}
virtual ~OpKernelBase() = default;
};
template <typename T>
class OpKernel : public OpKernelBase {
public:
using ELEMENT_TYPE = T;
};
class OperatorWithKernel : public OperatorBase {
public:
struct OpKernelKey {
platform::Place place_;
DataType data_type_;
OpKernelKey() = default;
explicit OpKernelKey(const platform::DeviceContext& dev_ctx) {
place_ = dev_ctx.GetPlace();
}
OpKernelKey(DataType data_type, platform::Place place)
: place_(place), data_type_(data_type) {}
OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
: place_(dev_ctx.GetPlace()), data_type_(data_type) {}
bool operator==(const OpKernelKey& o) const {
return platform::places_are_same_class(place_, o.place_);
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_;
}
};
struct OpKernelHash {
std::hash<bool> hash_;
std::hash<int> hash_;
size_t operator()(const OpKernelKey& key) const {
return hash_(platform::is_gpu_place(key.place_));
int place = key.place_.which();
int data_type = static_cast<int>(key.data_type_);
int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
(place & ((1 << NUM_PLACE_TYPE_LIMIT_IN_BIT) - 1));
return hash_(pre_hash);
}
};
using OpKernelMap =
std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
std::unordered_map<OpKernelKey, std::unique_ptr<OpKernelBase>,
OpKernelHash>;
OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
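
The hash now packs both halves of the key into one integer: the place index sits in the low bits and the data type in the bits above it, so float and double kernels for the same place land in different buckets. A self-contained, hedged sketch of that packing (the real bit width and enum values come from NUM_PLACE_TYPE_LIMIT_IN_BIT in operator.h and the DataType enum in framework.pb.h; the constants below are stand-ins):

#include <cstdio>
#include <functional>

// Stand-ins; the real values live in operator.h and framework.pb.h.
constexpr int kPlaceBits = 4;      // plays the role of NUM_PLACE_TYPE_LIMIT_IN_BIT
constexpr int kFP64 = 6;           // plays the role of static_cast<int>(DataType::FP64)
constexpr int kGpuPlaceIndex = 1;  // plays the role of key.place_.which()

int main() {
  // Data type in the high bits, place index in the low kPlaceBits bits.
  int pre_hash = kFP64 << kPlaceBits | (kGpuPlaceIndex & ((1 << kPlaceBits) - 1));
  std::printf("pre_hash = %d, hash = %zu\n", pre_hash, std::hash<int>()(pre_hash));
  return 0;
}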
@ -451,8 +466,10 @@ class OperatorWithKernel : public OperatorBase {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
ExecutionContext ctx(*this, scope, dev_ctx);
auto& opKernel = AllOpKernels().at(type_).at(
OpKernelKey(IndicateDataType(ctx), dev_ctx));
opKernel->Compute(ctx);
}
static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@ -462,13 +479,43 @@ class OperatorWithKernel : public OperatorBase {
}
bool SupportGPU() const override {
OperatorWithKernel::OpKernelKey key;
key.place_ = platform::GPUPlace();
return OperatorWithKernel::AllOpKernels().at(type_).count(key) != 0;
auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
return std::any_of(op_kernels.begin(), op_kernels.end(),
[](OpKernelMap::const_reference kern_pair) {
return platform::is_gpu_place(kern_pair.first.place_);
});
}
protected:
virtual void InferShape(InferShapeContextBase* ctx) const = 0;
// Indicate the kernel DataType by input data. By default, all inputs must have
// the same data type.
virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
auto& scope = ctx.scope();
int data_type = -1;
for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) {
auto* var = scope.FindVar(ipt_name);
if (var != nullptr) {
const Tensor* t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
PADDLE_ENFORCE(tmp == data_type || data_type == -1,
"DataType of Paddle Op must be same.");
data_type = tmp;
}
}
}
}
PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
return static_cast<DataType>(data_type);
}
};
} // namespace framework
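
Putting the pieces together: Run() calls IndicateDataType to infer a DataType from the inputs (unless an op overrides the hook, as cross_entropy does further down), builds OpKernelKey(data_type, dev_ctx), and looks the kernel up in AllOpKernels(). A hedged sketch of what that buys at registration time, with a hypothetical op name:

// Illustrative registration; "my_add" and MyAddKernel are hypothetical.
// Each listed kernel contributes one map entry whose data_type_ comes from
// OpKernel<T>::ELEMENT_TYPE, so float and double kernels coexist for one op.
REGISTER_OP_CPU_KERNEL(my_add,
                       ops::MyAddKernel<paddle::platform::CPUPlace, float>,
                       ops::MyAddKernel<paddle::platform::CPUPlace, double>);
// With double inputs, IndicateDataType(ctx) returns DataType::FP64 and the
// FP64/CPUPlace entry is selected; float inputs select the FP32 entry.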

@ -114,10 +114,13 @@ class OpWithKernelTest : public OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContextBase* ctx) const override {}
DataType IndicateDataType(const ExecutionContext& ctx) const override {
return DataType::FP32;
}
};
template <typename T1, typename T2>
class CPUKernelTest : public OpKernel {
class CPUKernelTest : public OpKernel<float> {
public:
void Compute(const ExecutionContext& ctx) const {
std::cout << "this is cpu kernel" << std::endl;
@ -144,7 +147,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
}
};
class CPUKernalMultiInputsTest : public OpKernel {
class CPUKernalMultiInputsTest : public OpKernel<float> {
public:
void Compute(const ExecutionContext& ctx) const {
auto xs = ctx.op().Inputs("xs");

@ -29,20 +29,10 @@ limitations under the License. */
namespace paddle {
namespace pybind {
namespace details {
template <bool less, size_t i, typename... args>
struct CastToPyBufferImpl;
}
} // namespace pybind
namespace framework {
class Tensor {
public:
template <bool less, size_t i, typename... args>
friend struct pybind::details::CastToPyBufferImpl;
template <typename T, size_t D, int MajorType, typename IndexType>
friend struct EigenTensor;
@ -119,6 +109,8 @@ class Tensor {
return holder_->place();
}
std::type_index type() const { return holder_->type(); }
private:
template <typename T>
inline void check_memory_size() const;

@ -47,7 +47,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
}
template <typename T>
class AccuracyOpCUDAKernel : public framework::OpKernel {
class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),

@ -35,7 +35,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename Place, typename T>
class AccuracyKernel : public framework::OpKernel {
class AccuracyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* inference = ctx.Input<Tensor>("Inference");

@ -20,7 +20,7 @@ namespace paddle {
namespace operators {
template <typename Place, typename T, typename Functor>
class ActivationKernel : public framework::OpKernel {
class ActivationKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");
@ -36,7 +36,7 @@ class ActivationKernel : public framework::OpKernel {
};
template <typename Place, typename T, typename Functor>
class ActivationGradKernel : public framework::OpKernel {
class ActivationGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");
@ -202,7 +202,7 @@ struct SquareGradFunctor {
};
template <typename Place, typename T, typename AttrType = T>
class BReluKernel : public framework::OpKernel {
class BReluKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");
@ -219,7 +219,7 @@ class BReluKernel : public framework::OpKernel {
};
template <typename Place, typename T, typename AttrType = T>
class BReluGradKernel : public framework::OpKernel {
class BReluGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");
@ -239,7 +239,7 @@ class BReluGradKernel : public framework::OpKernel {
};
template <typename Place, typename T, typename AttrType = T>
class SoftReluKernel : public framework::OpKernel {
class SoftReluKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");
@ -256,7 +256,7 @@ class SoftReluKernel : public framework::OpKernel {
};
template <typename Place, typename T, typename AttrType = T>
class SoftReluGradKernel : public framework::OpKernel {
class SoftReluGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");
@ -277,7 +277,7 @@ class SoftReluGradKernel : public framework::OpKernel {
};
template <typename Place, typename T, typename AttrType = T>
class PowKernel : public framework::OpKernel {
class PowKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");
@ -293,7 +293,7 @@ class PowKernel : public framework::OpKernel {
};
template <typename Place, typename T, typename AttrType = T>
class PowGradKernel : public framework::OpKernel {
class PowGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");
@ -312,7 +312,7 @@ class PowGradKernel : public framework::OpKernel {
};
template <typename Place, typename T, typename AttrType = T>
class STanhKernel : public framework::OpKernel {
class STanhKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");
@ -329,7 +329,7 @@ class STanhKernel : public framework::OpKernel {
};
template <typename Place, typename T, typename AttrType = T>
class STanhGradKernel : public framework::OpKernel {
class STanhGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<framework::Tensor>("X");

@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T>
class AddKernel : public framework::OpKernel {
class AddKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input0 = context.Input<Tensor>("X");

@ -56,7 +56,7 @@ class ClipGradFunctor {
};
template <typename Place, typename T>
class ClipKernel : public framework::OpKernel {
class ClipKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto max = context.Attr<T>("max");
@ -73,7 +73,7 @@ class ClipKernel : public framework::OpKernel {
};
template <typename Place, typename T>
class ClipGradKernel : public framework::OpKernel {
class ClipGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto max = context.Attr<T>("max");

@ -22,7 +22,7 @@ namespace paddle {
namespace operators {
template <typename Place, typename T>
class ConcatKernel : public framework::OpKernel {
class ConcatKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<framework::Tensor>("X");
@ -44,7 +44,7 @@ class ConcatKernel : public framework::OpKernel {
};
template <typename Place, typename T>
class ConcatGradKernel : public framework::OpKernel {
class ConcatGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));

@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T>
class CosSimKernel : public framework::OpKernel {
class CosSimKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
// get Tensor
@ -67,7 +67,7 @@ class CosSimKernel : public framework::OpKernel {
};
template <typename Place, typename T>
class CosSimGradKernel : public framework::OpKernel {
class CosSimGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
// get Tensor

@ -27,7 +27,7 @@ using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using framework::Tensor;
template <typename T>
class CropKernel : public framework::OpKernel {
class CropKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
@ -69,7 +69,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
}
template <typename Place, typename T>
class CropGradKernel : public framework::OpKernel {
class CropGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
size_t rank =

@ -47,6 +47,12 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Y", {x_dims[0], 1});
ctx->ShareLoD("X", /*->*/ "Y");
}
// CrossEntropy's data type is determined solely by "X"
framework::DataType IndicateDataType(
const framework::ExecutionContext& ctx) const override {
return framework::ToDataType(ctx.Input<Tensor>("X")->type());
}
};
class CrossEntropyGradientOp : public framework::OperatorWithKernel {
@ -87,6 +93,12 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
}
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
}
// CrossEntropy's data type is determined solely by "X"
framework::DataType IndicateDataType(
const framework::ExecutionContext& ctx) const override {
return framework::ToDataType(ctx.Input<Tensor>("X")->type());
}
};
class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {

@ -53,7 +53,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
} // namespace
template <typename T>
class CrossEntropyOpCUDAKernel : public framework::OpKernel {
class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@ -69,7 +69,7 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
};
template <typename T>
class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),

@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
class CrossEntropyOpKernel : public framework::OpKernel {
class CrossEntropyOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@ -42,7 +42,7 @@ class CrossEntropyOpKernel : public framework::OpKernel {
};
template <typename T>
class CrossEntropyGradientOpKernel : public framework::OpKernel {
class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),

@ -47,7 +47,7 @@ struct MaskGenerator {
// Use std::random and thrust::random (Thrust is a CUDA C++ template library
// modeled on the STL) to implement uniform random number generation.
template <typename Place, typename T, typename AttrType>
class GPUDropoutKernel : public framework::OpKernel {
class GPUDropoutKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");

@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T, typename AttrType>
class CPUDropoutKernel : public framework::OpKernel {
class CPUDropoutKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
@ -62,7 +62,7 @@ class CPUDropoutKernel : public framework::OpKernel {
};
template <typename Place, typename T>
class DropoutGradKernel : public framework::OpKernel {
class DropoutGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(context.Attr<bool>("is_training"),

@ -20,7 +20,7 @@ namespace paddle {
namespace operators {
template <typename Place, typename T>
class ElementwiseAddKernel : public framework::OpKernel {
class ElementwiseAddKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseCompute<EigenAddFunctor, Place, T>(ctx);
@ -101,7 +101,7 @@ struct ElementwiseAddBroadCast2GradFunctor {
};
template <typename Place, typename T>
class ElementwiseAddGradKernel : public framework::OpKernel {
class ElementwiseAddGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseGradCompute<Place, T, ElementwiseAddGradFunctor<T>,

@ -20,7 +20,7 @@ namespace paddle {
namespace operators {
template <typename Place, typename T>
class ElementwiseDivKernel : public framework::OpKernel {
class ElementwiseDivKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseCompute<EigenDivFunctor, Place, T>(ctx);
@ -103,7 +103,7 @@ struct ElementwiseDivBroadCast2GradFunctor {
};
template <typename Place, typename T>
class ElementwiseDivGradKernel : public framework::OpKernel {
class ElementwiseDivGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseGradCompute<Place, T, ElementwiseDivGradFunctor<T>,

@ -36,7 +36,9 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
elementwise_mul_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL(
elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>);
ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>,
ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>);
ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>,
ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>);

@ -19,7 +19,9 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>);
ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>,
ops::ElementwiseMulKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>);
ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>,
ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, double>);
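
Of the operator files shown, elementwise_mul is the one whose registrations actually gain a double kernel; any of the other templated kernels can be extended the same way. A hedged sketch for elementwise_add, assuming its existing registration mirrors the float-only form of the other ops:

REGISTER_OP_CPU_KERNEL(
    elementwise_add,
    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>,
    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, double>);
// The GPU registration would follow the same pattern with GPUPlace and
// REGISTER_OP_GPU_KERNEL.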

@ -19,7 +19,7 @@ namespace paddle {
namespace operators {
template <typename Place, typename T>
class ElementwiseMulKernel : public framework::OpKernel {
class ElementwiseMulKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseCompute<EigenMulFunctor, Place, T>(ctx);
@ -102,7 +102,7 @@ struct ElementwiseMulBroadCast2GradFunctor {
};
template <typename Place, typename T>
class ElementwiseMulGradKernel : public framework::OpKernel {
class ElementwiseMulGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseGradCompute<Place, T, ElementwiseMulGradFunctor<T>,

@ -19,7 +19,7 @@ namespace paddle {
namespace operators {
template <typename Place, typename T>
class ElementwiseSubKernel : public framework::OpKernel {
class ElementwiseSubKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseCompute<EigenSubFunctor, Place, T>(ctx);
@ -102,7 +102,7 @@ struct ElementwiseSubBroadCast2GradFunctor {
};
template <typename Place, typename T>
class ElementwiseSubGradKernel : public framework::OpKernel {
class ElementwiseSubGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseGradCompute<Place, T, ElementwiseSubGradFunctor<T>,

@ -20,7 +20,7 @@ namespace paddle {
namespace operators {
template <typename Place, typename T>
class FillZerosLikeKernel : public framework::OpKernel {
class FillZerosLikeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* output = context.Output<framework::Tensor>("Y");

Some files were not shown because too many files have changed in this diff.