Refine device context ()

This change mainly makes the following fixes:

- take `DeviceContext` as the template parameter of math functors and `OpKernel` instead of `Place` (a minimal sketch of the new pattern appears before the diff below)
- remove the `eigen_device` interface from the base class `DeviceContext`
- remove the `GetEigenDevice` interface from `ExecutionContext` and the base class `DeviceContext`
- remove the unused `platform::EigenDeviceConverter`
- rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL`
- rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP`

Branch: del_some_in_makelist
QI JUN committed via GitHub
parent 7902ad653c
commit 61ec0b9516
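
To make the new convention concrete, here is a minimal sketch of an operator kernel after this change. It is illustrative only: the hypothetical `my_scale` op, `MyScaleKernel`, and the header paths are assumptions rather than code from this commit; only the `DeviceContext` template parameter, the `eigen_device()` access path, and the renamed registration macro come from the diff below.

```cpp
// Illustrative sketch only (hypothetical my_scale op); header paths assumed
// to follow the layout used elsewhere in this diff. The op/OpMaker
// registration itself is omitted.
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

// The kernel is templated on a DeviceContext type instead of a Place.
template <typename DeviceContext, typename T>
class MyScaleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    auto x_e = framework::EigenVector<T>::Flatten(*x);
    auto out_e = framework::EigenVector<T>::Flatten(*out);
    // The Eigen device now comes from the concrete device context; the
    // removed ExecutionContext::GetEigenDevice<Place>() is no longer used.
    auto& place =
        *ctx.template device_context<DeviceContext>().eigen_device();
    out_e.device(place) = x_e * static_cast<T>(ctx.Attr<float>("scale"));
  }
};

}  // namespace operators
}  // namespace paddle

// CPU registration (normally in the .cc file) and CUDA registration
// (normally in the .cu file, via the renamed macro).
REGISTER_OP_CPU_KERNEL(
    my_scale,
    paddle::operators::MyScaleKernel<paddle::platform::CPUDeviceContext,
                                     float>);
REGISTER_OP_CUDA_KERNEL(
    my_scale,
    paddle::operators::MyScaleKernel<paddle::platform::CUDADeviceContext,
                                     float>);
```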

@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar {
return 0; \
}
#define REGISTER_OP_GPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar {
#else
#define USE_OP_KERNEL(op_type) \
USE_OP_DEVICE_KERNEL(op_type, CPU); \
USE_OP_DEVICE_KERNEL(op_type, GPU)
USE_OP_DEVICE_KERNEL(op_type, CUDA)
#endif
#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar {
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CPU);
#define USE_GPU_ONLY_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, GPU)
#define USE_CUDA_ONLY_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CUDA)
#define USE_OP(op_type) \
USE_OP_ITSELF(op_type); \

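For reference, a brief, hedged example of how the renamed USE_* macros are consumed; the `ncclAllReduce` line mirrors the CMake change later in this diff, and the expansion comments simply restate the definitions above.

```cpp
// Pull a CUDA-only operator into the pybind target (mirrors the nccl_op
// handling in op_library() later in this diff).
USE_CUDA_ONLY_OP(ncclAllReduce);
// Per the definition above, this expands to:
//   USE_OP_ITSELF(ncclAllReduce);
//   USE_OP_DEVICE_KERNEL(ncclAllReduce, CUDA)

// Operators with both CPU and CUDA kernels keep using USE_OP; when built
// with CUDA support, its kernel branch now names CUDA instead of GPU.
USE_OP(accuracy);
```
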
@ -22,20 +22,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
template <>
Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
platform::CPUPlace, Eigen::DefaultDevice>() const {
return *device_context_.GetEigenDevice<platform::CPUPlace>();
}
#ifdef PADDLE_WITH_CUDA
template <>
Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
return *device_context_.GetEigenDevice<platform::GPUPlace>();
}
#endif
std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(ins.size(), 1UL,
@ -429,7 +415,7 @@ void OperatorWithKernel::Run(const Scope& scope,
}
OpKernelType OperatorWithKernel::GetKernelType(
const ExecutionContext& ctx) const {
return OpKernelType(IndicateDataType(ctx), ctx.device_context());
return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
}
DataType OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {

@ -276,17 +276,25 @@ class ExecutionContext {
out_tensor->set_lod(in_tensor.lod());
}
template <typename PlaceType,
typename DeviceType = typename platform::EigenDeviceConverter<
PlaceType>::EigenDeviceType>
DeviceType& GetEigenDevice() const;
platform::Place GetPlace() const { return device_context_.GetPlace(); }
template <typename DeviceContextType>
const DeviceContextType& device_context() const {
return *reinterpret_cast<const DeviceContextType*>(&device_context_);
}
const platform::DeviceContext& device_context() const {
return device_context_;
}
#ifdef PADDLE_WITH_CUDA
const inline platform::CUDADeviceContext& cuda_device_context() const {
PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
return *reinterpret_cast<const platform::CUDADeviceContext*>(
&device_context_);
}
#endif
//! Get actual name vector for this input.
const std::vector<std::string>& Inputs(const std::string& name) const {
return op_.Inputs(name);
@ -297,14 +305,6 @@ class ExecutionContext {
return op_.Outputs(name);
}
#ifdef PADDLE_WITH_CUDA
const inline platform::CUDADeviceContext& cuda_device_context() const {
PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
return *reinterpret_cast<const platform::CUDADeviceContext*>(
&device_context_);
}
#endif
private:
const OperatorBase& op_;
const Scope& scope_;

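As a brief sketch, a CUDA kernel body can obtain the typed context through either accessor above; the hypothetical `DummyCUDAKernel` and the commented `stream()` launch are assumptions for illustration, not code from this commit.

```cpp
// Hypothetical kernel body (namespace paddle::operators assumed).
template <typename T>
class DummyCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Device-parameterized accessor introduced above:
    auto& cuda_ctx =
        ctx.template device_context<platform::CUDADeviceContext>();
    // CUDA-only shortcut that additionally enforces a GPU place:
    // auto& cuda_ctx = ctx.cuda_device_context();

    // A real kernel would launch work on the context's stream, e.g.
    //   SomeKernel<<<grid, threads, 0, cuda_ctx.stream()>>>(/* args */);
    (void)cuda_ctx;
  }
};
```
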
@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
return OpKernelType(DataType::FP32, ctx.device_context());
return OpKernelType(DataType::FP32, ctx.GetPlace());
}
};

@ -138,7 +138,7 @@ function(op_library TARGET)
if ("${TARGET}" STREQUAL "nccl_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
endif()
# reduce_op contains several operators

@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
ctx.device_context());
ctx.GetPlace());
}
};

@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
// FIXME(typhoonzero): types of T is for inference data.
// label data is always int64
REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(accuracy,
paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>);

@ -21,7 +21,7 @@ namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AccuracyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {

@ -611,16 +611,17 @@ REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
ops::ActivationOpGrad);
#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::CPUPlace, \
ops::functor<double>>); \
REGISTER_OP_CPU_KERNEL( \
act_type##_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::CPUPlace, \
#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
ops::functor<double>>); \
REGISTER_OP_CPU_KERNEL( \
act_type##_grad, \
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);

@ -17,16 +17,17 @@
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_GPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::GPUPlace, \
ops::functor<double>>); \
REGISTER_OP_GPU_KERNEL( \
act_type##_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::GPUPlace, \
#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \
act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
ops::functor<double>>); \
REGISTER_OP_CUDA_KERNEL( \
act_type##_grad, \
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);

@ -19,7 +19,7 @@
namespace paddle {
namespace operators {
template <typename Place, typename Functor>
template <typename DeviceContext, typename Functor>
class ActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
@ -32,18 +32,19 @@ class ActivationKernel
auto x = framework::EigenVector<T>::Flatten(*X);
auto y = framework::EigenVector<T>::Flatten(*Y);
auto place = context.GetEigenDevice<Place>();
auto* place =
context.template device_context<DeviceContext>().eigen_device();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(place, x, y);
functor(*place, x, y);
}
};
template <typename Place, typename Functor>
template <typename DeviceContext, typename Functor>
class ActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
@ -59,13 +60,14 @@ class ActivationGradKernel
auto x = framework::EigenVector<T>::Flatten(*X);
auto y = framework::EigenVector<T>::Flatten(*Y);
auto dx = framework::EigenVector<T>::Flatten(*dX);
auto place = context.GetEigenDevice<Place>();
auto* place =
context.template device_context<DeviceContext>().eigen_device();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(place, x, y, dy, dx);
functor(*place, x, y, dy, dx);
}
};

@ -109,5 +109,5 @@ $$
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
REGISTER_OP_CPU_KERNEL(
adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, double>);

@ -16,6 +16,6 @@
#include "paddle/operators/adadelta_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_CUDA_KERNEL(
adadelta, ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, double>);

@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AdadeltaOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
auto avg_squared_update_out =
framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
avg_squared_grad_out.device(place) =
rho * avg_squared_grad + (1 - rho) * grad.square();

@ -100,8 +100,8 @@ size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
} // namespace
template <typename T>
struct SparseAdagradFunctor<platform::CPUPlace, T> {
void operator()(const platform::DeviceContext& context,
struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
@ -120,7 +120,7 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());
math::SetConstant<platform::CPUPlace, T> constant_functor;
math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);
auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@ -144,9 +144,9 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
gs.device(*context.GetEigenDevice<platform::CPUPlace>()) = gm * gm;
gs.device(*context.eigen_device()) = gm * gm;
math::SelectedRowsAddToTensor<platform::CPUPlace, T> functor;
math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
functor(context, *grad_square, moment);
// 3. update parameter
@ -164,13 +164,13 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
}
};
template struct SparseAdagradFunctor<platform::CPUPlace, float>;
template struct SparseAdagradFunctor<platform::CPUPlace, double>;
template struct SparseAdagradFunctor<platform::CPUDeviceContext, float>;
template struct SparseAdagradFunctor<platform::CPUDeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
REGISTER_OP_CPU_KERNEL(
adagrad, ops::AdagradOpKernel<paddle::platform::CPUPlace, float>,
ops::AdagradOpKernel<paddle::platform::CPUPlace, double>);
adagrad, ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, double>);

@ -72,8 +72,8 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
} // namespace
template <typename T>
struct SparseAdagradFunctor<platform::GPUPlace, T> {
void operator()(const platform::DeviceContext& context,
struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
@ -92,7 +92,7 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());
math::SetConstant<platform::GPUPlace, T> constant_functor;
math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);
auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@ -119,9 +119,9 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
gs.device(*context.GetEigenDevice<platform::GPUPlace>()) = gm * gm;
gs.device(*context.eigen_device()) = gm * gm;
math::SelectedRowsAddToTensor<platform::GPUPlace, T> functor;
math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
functor(context, *grad_square, moment);
// 3. update parameter
@ -139,13 +139,13 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
}
};
template struct SparseAdagradFunctor<platform::GPUPlace, float>;
template struct SparseAdagradFunctor<platform::GPUPlace, double>;
template struct SparseAdagradFunctor<platform::CUDADeviceContext, float>;
template struct SparseAdagradFunctor<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
adagrad, ops::AdagradOpKernel<paddle::platform::GPUPlace, float>,
ops::AdagradOpKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_CUDA_KERNEL(
adagrad, ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, double>);

@ -19,15 +19,15 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename Place, typename T>
template <typename DeviceContext, typename T>
struct SparseAdagradFunctor {
void operator()(const platform::DeviceContext& context,
void operator()(const DeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param);
};
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AdagradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@ -52,11 +52,11 @@ class AdagradOpKernel : public framework::OpKernel<T> {
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
moment_out.device(place) = moment + grad * grad;
moment_out.device(*place) = moment + grad * grad;
Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
param_out.device(place) =
param_out.device(*place) =
param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* param_tensor = ctx.Input<framework::Tensor>("Param");
@ -65,8 +65,9 @@ class AdagradOpKernel : public framework::OpKernel<T> {
auto* moment_tensor = ctx.Input<framework::Tensor>("Moment");
PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
SparseAdagradFunctor<Place, T> functor;
functor(ctx.device_context(), *ctx.Input<framework::SelectedRows>("Grad"),
SparseAdagradFunctor<DeviceContext, T> functor;
functor(ctx.template device_context<DeviceContext>(),
*ctx.Input<framework::SelectedRows>("Grad"),
*ctx.Input<framework::Tensor>("LearningRate"), epsilon,
moment_out_tensor, param_out_tensor);
} else {

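The same `DeviceContext` parameter now also threads through the math functors. Below is a hedged sketch of a functor written in the new style; the hypothetical `ScaleFunctor` is not part of this commit, and its signature and `eigen_device()` use simply follow the `SparseAdagradFunctor` changes above.

```cpp
// Hypothetical math functor (namespace paddle::operators assumed): the device
// context type, not the place, is the template parameter, and operator()
// receives the concrete context.
template <typename DeviceContext, typename T>
struct ScaleFunctor {
  void operator()(const DeviceContext& context, const framework::Tensor& in,
                  T scale, framework::Tensor* out);
};

// CPU specialization; the caller is assumed to have allocated `out`.
template <typename T>
struct ScaleFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& in, T scale,
                  framework::Tensor* out) {
    auto x = framework::EigenVector<T>::Flatten(in);
    auto y = framework::EigenVector<T>::Flatten(*out);
    // The Eigen device comes straight from the typed context.
    y.device(*context.eigen_device()) = x * scale;
  }
};
```
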
@ -128,6 +128,6 @@ $$
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
REGISTER_OP_CPU_KERNEL(adam,
ops::AdamOpKernel<paddle::platform::CPUPlace, float>,
ops::AdamOpKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
adam, ops::AdamOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdamOpKernel<paddle::platform::CPUDeviceContext, double>);

@ -16,6 +16,6 @@
#include "paddle/operators/adam_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(adam,
ops::AdamOpKernel<paddle::platform::GPUPlace, float>,
ops::AdamOpKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_CUDA_KERNEL(
adam, ops::AdamOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::AdamOpKernel<paddle::platform::CUDADeviceContext, double>);

@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AdamOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@ -52,17 +52,17 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto moment1_out = framework::EigenVector<T>::Flatten(*moment1_out_tensor);
auto moment2_out = framework::EigenVector<T>::Flatten(*moment2_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad;
moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square();
moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad;
moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square();
// All of these are tensors of 1 element
auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow);
// Eigen does not support automatic broadcast
// Get dimensions of moment vector to broadcast lr_t
Eigen::DSizes<int, 1> m_dsize(moment1_out_tensor->numel());
param_out.device(place) =
param_out.device(*place) =
param -
lr_t.broadcast(m_dsize) *
(moment1_out / (moment2_out.sqrt() + epsilon));

@ -127,6 +127,6 @@ division by 0 error.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
REGISTER_OP_CPU_KERNEL(adamax,
ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>,
ops::AdamaxOpKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
adamax, ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, double>);

@ -16,6 +16,6 @@
#include "paddle/operators/adamax_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(adamax,
ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>,
ops::AdamaxOpKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_CUDA_KERNEL(
adamax, ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, double>);

@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AdamaxOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@ -51,14 +51,14 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
auto inf_norm_out =
framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
moment_out.device(place) = beta1 * moment + (1 - beta1) * grad;
inf_norm_out.device(place) =
moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad;
inf_norm_out.device(*place) =
grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
auto lr_t = lr / (1 - beta1_pow);
Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
param_out.device(place) =
param_out.device(*place) =
param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
}
};

@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T>
template <typename DeviceContext, typename T>
class AucKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {

@ -135,7 +135,8 @@ The required data format for this layer is one of the following:
};
template <typename T>
class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
class BatchNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
@ -318,12 +319,12 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
PADDLE_THROW("can't find Y@GRAD");
}
return framework::OpKernelType(framework::ToDataType(t->type()),
ctx.device_context());
ctx.GetPlace());
}
};
template <typename T>
class BatchNormGradKernel<platform::CPUPlace, T>
class BatchNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
@ -436,8 +437,9 @@ class BatchNormGradKernel<platform::CPUPlace, T>
namespace ops = paddle::operators;
REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
batch_norm_grad, ops::BatchNormGradOp);
REGISTER_OP_CPU_KERNEL(batch_norm,
ops::BatchNormKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
batch_norm,
ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad,
ops::BatchNormGradKernel<paddle::platform::CPUPlace, float>);
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>);

Some files were not shown because too many files have changed in this diff.