Refine device context ()

This change mainly makes the following fixes:

- take `DeviceContext` as the template parameter of math functors and `OpKernel` instead of `Place` (a minimal sketch of the new pattern appears before the diff below)
- remove the `eigen_device` interface from the base class `DeviceContext`
- remove the `GetEigenDevice` interface from `ExecutionContext` and the base class `DeviceContext`
- remove the unused `platform::EigenDeviceConverter`
- rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL`
- rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP`

Branch: del_some_in_makelist
QI JUN committed via GitHub
parent 7902ad653c
commit 61ec0b9516
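
To make the new convention concrete, here is a minimal sketch of an operator kernel after this change. It is illustrative only: the hypothetical `my_scale` op, `MyScaleKernel`, and the header paths are assumptions rather than code from this commit; only the `DeviceContext` template parameter, the `eigen_device()` access path, and the renamed registration macro come from the diff below.

```cpp
// Illustrative sketch only (hypothetical my_scale op); header paths assumed
// to follow the layout used elsewhere in this diff. The op/OpMaker
// registration itself is omitted.
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

// The kernel is templated on a DeviceContext type instead of a Place.
template <typename DeviceContext, typename T>
class MyScaleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    auto x_e = framework::EigenVector<T>::Flatten(*x);
    auto out_e = framework::EigenVector<T>::Flatten(*out);
    // The Eigen device now comes from the concrete device context; the
    // removed ExecutionContext::GetEigenDevice<Place>() is no longer used.
    auto& place =
        *ctx.template device_context<DeviceContext>().eigen_device();
    out_e.device(place) = x_e * static_cast<T>(ctx.Attr<float>("scale"));
  }
};

}  // namespace operators
}  // namespace paddle

// CPU registration (normally in the .cc file) and CUDA registration
// (normally in the .cu file, via the renamed macro).
REGISTER_OP_CPU_KERNEL(
    my_scale,
    paddle::operators::MyScaleKernel<paddle::platform::CPUDeviceContext,
                                     float>);
REGISTER_OP_CUDA_KERNEL(
    my_scale,
    paddle::operators::MyScaleKernel<paddle::platform::CUDADeviceContext,
                                     float>);
```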

@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar {
return 0; \
}
#define REGISTER_OP_GPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar {
#else
#define USE_OP_KERNEL(op_type) \
USE_OP_DEVICE_KERNEL(op_type, CPU); \
USE_OP_DEVICE_KERNEL(op_type, GPU)
USE_OP_DEVICE_KERNEL(op_type, CUDA)
#endif
#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar {
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CPU);
#define USE_GPU_ONLY_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, GPU)
#define USE_CUDA_ONLY_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CUDA)
#define USE_OP(op_type) \
USE_OP_ITSELF(op_type); \

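For reference, a brief, hedged example of how the renamed USE_* macros are consumed; the `ncclAllReduce` line mirrors the CMake change later in this diff, and the expansion comments simply restate the definitions above.

```cpp
// Pull a CUDA-only operator into the pybind target (mirrors the nccl_op
// handling in op_library() later in this diff).
USE_CUDA_ONLY_OP(ncclAllReduce);
// Per the definition above, this expands to:
//   USE_OP_ITSELF(ncclAllReduce);
//   USE_OP_DEVICE_KERNEL(ncclAllReduce, CUDA)

// Operators with both CPU and CUDA kernels keep using USE_OP; when built
// with CUDA support, its kernel branch now names CUDA instead of GPU.
USE_OP(accuracy);
```
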
@ -22,20 +22,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
template <>
Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
platform::CPUPlace, Eigen::DefaultDevice>() const {
return *device_context_.GetEigenDevice<platform::CPUPlace>();
}
#ifdef PADDLE_WITH_CUDA
template <>
Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
return *device_context_.GetEigenDevice<platform::GPUPlace>();
}
#endif
std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(ins.size(), 1UL,
@ -429,7 +415,7 @@ void OperatorWithKernel::Run(const Scope& scope,
}
OpKernelType OperatorWithKernel::GetKernelType(
const ExecutionContext& ctx) const {
return OpKernelType(IndicateDataType(ctx), ctx.device_context());
return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
}
DataType OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {

@ -276,17 +276,25 @@ class ExecutionContext {
out_tensor->set_lod(in_tensor.lod());
}
template <typename PlaceType,
typename DeviceType = typename platform::EigenDeviceConverter<
PlaceType>::EigenDeviceType>
DeviceType& GetEigenDevice() const;
platform::Place GetPlace() const { return device_context_.GetPlace(); }
template <typename DeviceContextType>
const DeviceContextType& device_context() const {
return *reinterpret_cast<const DeviceContextType*>(&device_context_);
}
const platform::DeviceContext& device_context() const {
return device_context_;
}
#ifdef PADDLE_WITH_CUDA
const inline platform::CUDADeviceContext& cuda_device_context() const {
PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
return *reinterpret_cast<const platform::CUDADeviceContext*>(
&device_context_);
}
#endif
//! Get actual name vector for this input.
const std::vector<std::string>& Inputs(const std::string& name) const {
return op_.Inputs(name);
@ -297,14 +305,6 @@ class ExecutionContext {
return op_.Outputs(name);
}
#ifdef PADDLE_WITH_CUDA
const inline platform::CUDADeviceContext& cuda_device_context() const {
PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
return *reinterpret_cast<const platform::CUDADeviceContext*>(
&device_context_);
}
#endif
private:
const OperatorBase& op_;
const Scope& scope_;

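As a brief sketch, a CUDA kernel body can obtain the typed context through either accessor above; the hypothetical `DummyCUDAKernel` and the commented `stream()` launch are assumptions for illustration, not code from this commit.

```cpp
// Hypothetical kernel body (namespace paddle::operators assumed).
template <typename T>
class DummyCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Device-parameterized accessor introduced above:
    auto& cuda_ctx =
        ctx.template device_context<platform::CUDADeviceContext>();
    // CUDA-only shortcut that additionally enforces a GPU place:
    // auto& cuda_ctx = ctx.cuda_device_context();

    // A real kernel would launch work on the context's stream, e.g.
    //   SomeKernel<<<grid, threads, 0, cuda_ctx.stream()>>>(/* args */);
    (void)cuda_ctx;
  }
};
```
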
@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
return OpKernelType(DataType::FP32, ctx.device_context());
return OpKernelType(DataType::FP32, ctx.GetPlace());
}
};

@ -138,7 +138,7 @@ function(op_library TARGET)
if ("${TARGET}" STREQUAL "nccl_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
endif()
# reduce_op contains several operators

@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
ctx.device_context());
ctx.GetPlace());
}
};

@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
// FIXME(typhoonzero): types of T is for inference data.
// label data is always int64
REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(accuracy,
paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>);

@ -21,7 +21,7 @@ namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AccuracyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {

@ -611,16 +611,17 @@ REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
ops::ActivationOpGrad);
#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::CPUPlace, \
ops::functor<double>>); \
REGISTER_OP_CPU_KERNEL( \
act_type##_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::CPUPlace, \
#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
ops::functor<double>>); \
REGISTER_OP_CPU_KERNEL( \
act_type##_grad, \
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);

@ -17,16 +17,17 @@
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_GPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::GPUPlace, \
ops::functor<double>>); \
REGISTER_OP_GPU_KERNEL( \
act_type##_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::GPUPlace, \
#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
ops::functor<double>>); \
REGISTER_OP_CUDA_KERNEL( \
act_type##_grad, \
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);

@ -19,7 +19,7 @@
namespace paddle {
namespace operators {
template <typename Place, typename Functor>
template <typename DeviceContext, typename Functor>
class ActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
@ -32,18 +32,19 @@ class ActivationKernel
auto x = framework::EigenVector<T>::Flatten(*X);
auto y = framework::EigenVector<T>::Flatten(*Y);
auto place = context.GetEigenDevice<Place>();
auto* place =
context.template device_context<DeviceContext>().eigen_device();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(place, x, y);
functor(*place, x, y);
}
};
template <typename Place, typename Functor>
template <typename DeviceContext, typename Functor>
class ActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
@ -59,13 +60,14 @@ class ActivationGradKernel
auto x = framework::EigenVector<T>::Flatten(*X);
auto y = framework::EigenVector<T>::Flatten(*Y);
auto dx = framework::EigenVector<T>::Flatten(*dX);
auto place = context.GetEigenDevice<Place>();
auto* place =
context.template device_context<DeviceContext>().eigen_device();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(place, x, y, dy, dx);
functor(*place, x, y, dy, dx);
}
};

@ -109,5 +109,5 @@ $$
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
REGISTER_OP_CPU_KERNEL(
adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, double>);

@ -16,6 +16,6 @@
#include "paddle/operators/adadelta_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_CUDA_KERNEL(
adadelta, ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, double>);

@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AdadeltaOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
auto avg_squared_update_out =
framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
avg_squared_grad_out.device(place) =
rho * avg_squared_grad + (1 - rho) * grad.square();

@ -100,8 +100,8 @@ size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
} // namespace
template <typename T>
struct SparseAdagradFunctor<platform::CPUPlace, T> {
void operator()(const platform::DeviceContext& context,
struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
@ -120,7 +120,7 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());
math::SetConstant<platform::CPUPlace, T> constant_functor;
math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);
auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@ -144,9 +144,9 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
gs.device(*context.GetEigenDevice<platform::CPUPlace>()) = gm * gm;
gs.device(*context.eigen_device()) = gm * gm;
math::SelectedRowsAddToTensor<platform::CPUPlace, T> functor;
math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
functor(context, *grad_square, moment);
// 3. update parameter
@ -164,13 +164,13 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
}
};
template struct SparseAdagradFunctor<platform::CPUPlace, float>;
template struct SparseAdagradFunctor<platform::CPUPlace, double>;
template struct SparseAdagradFunctor<platform::CPUDeviceContext, float>;
template struct SparseAdagradFunctor<platform::CPUDeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
REGISTER_OP_CPU_KERNEL(
adagrad, ops::AdagradOpKernel<paddle::platform::CPUPlace, float>,
ops::AdagradOpKernel<paddle::platform::CPUPlace, double>);
adagrad, ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, double>);

@ -72,8 +72,8 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
} // namespace
template <typename T>
struct SparseAdagradFunctor<platform::GPUPlace, T> {
void operator()(const platform::DeviceContext& context,
struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
@ -92,7 +92,7 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());
math::SetConstant<platform::GPUPlace, T> constant_functor;
math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);
auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@ -119,9 +119,9 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
gs.device(*context.GetEigenDevice<platform::GPUPlace>()) = gm * gm;
gs.device(*context.eigen_device()) = gm * gm;
math::SelectedRowsAddToTensor<platform::GPUPlace, T> functor;
math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
functor(context, *grad_square, moment);
// 3. update parameter
@ -139,13 +139,13 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
}
};
template struct SparseAdagradFunctor<platform::GPUPlace, float>;
template struct SparseAdagradFunctor<platform::GPUPlace, double>;
template struct SparseAdagradFunctor<platform::CUDADeviceContext, float>;
template struct SparseAdagradFunctor<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
adagrad, ops::AdagradOpKernel<paddle::platform::GPUPlace, float>,
ops::AdagradOpKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_CUDA_KERNEL(
adagrad, ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, double>);

@ -19,15 +19,15 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename Place, typename T>
template <typename DeviceContext, typename T>
struct SparseAdagradFunctor {
void operator()(const platform::DeviceContext& context,
void operator()(const DeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param);
};
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AdagradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@ -52,11 +52,11 @@ class AdagradOpKernel : public framework::OpKernel<T> {
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
moment_out.device(place) = moment + grad * grad;
moment_out.device(*place) = moment + grad * grad;
Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
param_out.device(place) =
param_out.device(*place) =
param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* param_tensor = ctx.Input<framework::Tensor>("Param");
@ -65,8 +65,9 @@ class AdagradOpKernel : public framework::OpKernel<T> {
auto* moment_tensor = ctx.Input<framework::Tensor>("Moment");
PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
SparseAdagradFunctor<Place, T> functor;
functor(ctx.device_context(), *ctx.Input<framework::SelectedRows>("Grad"),
SparseAdagradFunctor<DeviceContext, T> functor;
functor(ctx.template device_context<DeviceContext>(),
*ctx.Input<framework::SelectedRows>("Grad"),
*ctx.Input<framework::Tensor>("LearningRate"), epsilon,
moment_out_tensor, param_out_tensor);
} else {

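The same `DeviceContext` parameter now also threads through the math functors. Below is a hedged sketch of a functor written in the new style; the hypothetical `ScaleFunctor` is not part of this commit, and its signature and `eigen_device()` use simply follow the `SparseAdagradFunctor` changes above.

```cpp
// Hypothetical math functor (namespace paddle::operators assumed): the device
// context type, not the place, is the template parameter, and operator()
// receives the concrete context.
template <typename DeviceContext, typename T>
struct ScaleFunctor {
  void operator()(const DeviceContext& context, const framework::Tensor& in,
                  T scale, framework::Tensor* out);
};

// CPU specialization; the caller is assumed to have allocated `out`.
template <typename T>
struct ScaleFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& in, T scale,
                  framework::Tensor* out) {
    auto x = framework::EigenVector<T>::Flatten(in);
    auto y = framework::EigenVector<T>::Flatten(*out);
    // The Eigen device comes straight from the typed context.
    y.device(*context.eigen_device()) = x * scale;
  }
};
```
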
@ -128,6 +128,6 @@ $$
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
REGISTER_OP_CPU_KERNEL(adam,
ops::AdamOpKernel<paddle::platform::CPUPlace, float>,
ops::AdamOpKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
adam, ops::AdamOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdamOpKernel<paddle::platform::CPUDeviceContext, double>);

@ -16,6 +16,6 @@
#include "paddle/operators/adam_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(adam,
ops::AdamOpKernel<paddle::platform::GPUPlace, float>,
ops::AdamOpKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_CUDA_KERNEL(
adam, ops::AdamOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::AdamOpKernel<paddle::platform::CUDADeviceContext, double>);

@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AdamOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@ -52,17 +52,17 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto moment1_out = framework::EigenVector<T>::Flatten(*moment1_out_tensor);
auto moment2_out = framework::EigenVector<T>::Flatten(*moment2_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad;
moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square();
moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad;
moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square();
// All of these are tensors of 1 element
auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow);
// Eigen does not support automatic broadcast
// Get dimensions of moment vector to broadcast lr_t
Eigen::DSizes<int, 1> m_dsize(moment1_out_tensor->numel());
param_out.device(place) =
param_out.device(*place) =
param -
lr_t.broadcast(m_dsize) *
(moment1_out / (moment2_out.sqrt() + epsilon));

@ -127,6 +127,6 @@ division by 0 error.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
REGISTER_OP_CPU_KERNEL(adamax,
ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>,
ops::AdamaxOpKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
adamax, ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, double>);

@ -16,6 +16,6 @@
#include "paddle/operators/adamax_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(adamax,
ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>,
ops::AdamaxOpKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_CUDA_KERNEL(
adamax, ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, double>);

@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AdamaxOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@ -51,14 +51,14 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
auto inf_norm_out =
framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
moment_out.device(place) = beta1 * moment + (1 - beta1) * grad;
inf_norm_out.device(place) =
moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad;
inf_norm_out.device(*place) =
grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
auto lr_t = lr / (1 - beta1_pow);
Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
param_out.device(place) =
param_out.device(*place) =
param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
}
};

@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AucKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {

@ -135,7 +135,8 @@ The required data format for this layer is one of the following:
};
template <typename T>
class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
class BatchNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
@ -318,12 +319,12 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
PADDLE_THROW("can't find Y@GRAD");
}
return framework::OpKernelType(framework::ToDataType(t->type()),
ctx.device_context());
ctx.GetPlace());
}
};
template <typename T>
class BatchNormGradKernel<platform::CPUPlace, T>
class BatchNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
@ -436,8 +437,9 @@ class BatchNormGradKernel<platform::CPUPlace, T>
namespace ops = paddle::operators;
REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
batch_norm_grad, ops::BatchNormGradOp);
REGISTER_OP_CPU_KERNEL(batch_norm,
ops::BatchNormKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
batch_norm,
ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad,
ops::BatchNormGradKernel<paddle::platform::CPUPlace, float>);
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>);

Some files were not shown because too many files have changed in this diff.