Paddle/paddle/fluid/operators/adam_op.cc

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/adam_op.h"

namespace paddle {
namespace operators {

// Operator definition for one Adam optimizer update step.
//
// This class only performs shape inference: it checks that every required
// input/output variable is present, that the scalar-like inputs hold exactly
// one element, and that the parameter, gradient and both moment tensors agree
// in shape. The numeric update itself lives in the registered kernel.
class AdamOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates the operator's inputs/outputs and propagates the shape of
  // "Param" to "ParamOut", "Moment1Out" and "Moment2Out".
  //
  // Each check is a PADDLE_ENFORCE / PADDLE_ENFORCE_EQ with the message given
  // alongside it; the checks run in the order written below.
  void InferShape(framework::InferShapeContext *ctx) const override {
    // Required inputs.
    PADDLE_ENFORCE(ctx->HasInput("Param"),
                   "Input(Param) of AdamOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Grad"),
                   "Input(Grad) of AdamOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Moment1"),
                   "Input(Moment1) of AdamOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Moment2"),
                   "Input(Moment2) of AdamOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
                   "Input(LearningRate) of AdamOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
                   "Input(Beta1Pow) of AdamOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
                   "Input(Beta2Pow) of AdamOp should not be null.");

    // Required outputs.
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(ParamOut) of AdamOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
                   "Output(Moment1Out) of AdamOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
                   "Output(Moment2Out) of AdamOp should not be null.");

    // The learning rate and the beta power accumulators must each hold a
    // single value. The product of the dims is the element count, so the
    // check is on the number of elements, not on the tensor rank (e.g. both
    // [1] and [1, 1] are accepted).
    auto lr_dims = ctx->GetInputDim("LearningRate");
    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                      "Learning rate should have 1 element");
    auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
                      "Beta1 power accumulator should have 1 element");
    auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
                      "Beta2 power accumulator should have 1 element");

    // Param, Grad and both moment tensors must share one shape.
    auto param_dims = ctx->GetInputDim("Param");
    PADDLE_ENFORCE_EQ(
        param_dims, ctx->GetInputDim("Grad"),
        "Param and Grad input of AdamOp should have same dimension");
    PADDLE_ENFORCE_EQ(
        param_dims, ctx->GetInputDim("Moment1"),
        "Param and Moment1 input of AdamOp should have same dimension");
    PADDLE_ENFORCE_EQ(
        param_dims, ctx->GetInputDim("Moment2"),
        "Param and Moment2 input of AdamOp should have same dimension");

    // Every output has the same shape as the parameter.
    ctx->SetOutputDim("ParamOut", param_dims);
    ctx->SetOutputDim("Moment1Out", param_dims);
    ctx->SetOutputDim("Moment2Out", param_dims);
  }
};

// Declares the proto (inputs, outputs, attributes and user-facing
// documentation) of the Adam operator.
class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  // Forwards `proto` and `op_checker` to the base maker, then registers the
  // operator's inputs, outputs, attributes (with their defaults) and comment.
  AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");
    AddInput("LearningRate", "(Tensor) Learning rate");
    AddInput("Moment1", "(Tensor) Input first moment");
    AddInput("Moment2", "(Tensor) Input second moment");
    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");

    AddOutput("ParamOut", "(Tensor) Output parameter");
    AddOutput("Moment1Out", "(Tensor) Output first moment");
    AddOutput("Moment2Out", "(Tensor) Output second moment");

    AddAttr<float>("beta1",
                   "(float, default 0.9) "
                   "Exponential decay rate for the "
                   "first moment estimates.")
        .SetDefault(0.9f);
    AddAttr<float>("beta2",
                   "(float, default 0.999) "
                   "exponential decay rate for the "
                   "second moment estimates.")
        .SetDefault(0.999f);
    AddAttr<float>("epsilon",
                   "(float, default 1.0e-8) "
                   "Constant for numerical stability")
        .SetDefault(1.0e-8f);

    // The text below is rendered as the operator's documentation; the block
    // between the $$ markers is LaTeX. The parameter update is expressed in
    // terms of the freshly computed moments (moment_1_out / moment_2_out).
    AddComment(R"DOC(
Adam Optimizer.

This implements the Adam optimizer from Section 2 of the Adam
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.

Adam updates:

$$
moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
learning\_rate = learning\_rate *
                  \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
param\_out = param - learning\_rate * \frac{moment\_1\_out}{\sqrt{moment\_2\_out} + \epsilon}
$$

)DOC");
  }
};
}  // namespace operators
}  // namespace paddle

// Short alias so the registration macros below can refer to the operator
// classes as ops::Name.
namespace ops = paddle::operators;
// Register the "adam" operator with its shape-inference class and its proto
// maker. As the macro name indicates, no gradient operator is registered.
REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
// Register the CPU kernels for "adam", instantiated for float and double.
// NOTE(review): AdamOpKernel is not defined in this file; presumably it comes
// from the included adam_op.h -- confirm.
REGISTER_OP_CPU_KERNEL(
    adam, ops::AdamOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::AdamOpKernel<paddle::platform::CPUDeviceContext, double>);
Adding the Adam Optimizer operator (#4733) * add adam op moment1_out = beta1 * moment1 + (1 − beta1) * grad moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad moment1_hat = moment1_out / (1 - beta1^t) moment2_hat = moment2_out / (1 - beta2^t) param_out = param - learning_rate * moment1_hat / (sqrt(moment2_hat) + epsilon) * fix moment 2 * Adding the Adam optimization operator * Adding more tests for Adam op 7 years ago			`/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.`

			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License. */`

			`#include "paddle/operators/adam_op.h"`

			`namespace paddle {`
			`namespace operators {`

			`class AdamOp : public framework::OperatorWithKernel {`
			`public:`
			`using framework::OperatorWithKernel::OperatorWithKernel;`

			`void InferShape(framework::InferShapeContext *ctx) const override {`
			`PADDLE_ENFORCE(ctx->HasInput("Param"),`
			`"Input(Param) of AdamOp should not be null.");`
			`PADDLE_ENFORCE(ctx->HasInput("Grad"),`
			`"Input(Grad) of AdamOp should not be null.");`
			`PADDLE_ENFORCE(ctx->HasInput("Moment1"),`
			`"Input(Moment1) of AdamOp should not be null.");`
			`PADDLE_ENFORCE(ctx->HasInput("Moment2"),`
			`"Input(Moment2) of AdamOp should not be null.");`
			`PADDLE_ENFORCE(ctx->HasInput("LearningRate"),`
			`"Input(LearningRate) of AdamOp should not be null.");`
			`PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),`
			`"Input(Beta1Pow) of AdamOp should not be null.");`
			`PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),`
			`"Input(Beta2Pow) of AdamOp should not be null.");`

			`PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),`
			`"Output(ParamOut) of AdamOp should not be null.");`
			`PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),`
			`"Output(Moment1Out) of AdamOp should not be null.");`
			`PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),`
			`"Output(Moment2Out) of AdamOp should not be null.");`

			`auto lr_dims = ctx->GetInputDim("LearningRate");`
			`PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,`
			`"Learning rate should have 1 dimension");`
			`auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");`
			`PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,`
			`"Beta1 power accumulator should have 1 dimension");`
			`auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");`
Adding the doc format for AdaDelta, AdaMax, Adam, AdaGrad, BatchNorm, Clip, Cast and AUC (#5317) * Adding the doc format for AdaDelta * Updating the documentation for Adagrad, Adam and Adamax * Updating the auc op * Fix review comments * Updating doc for Batch Norm * Updating the cast op * Updating the clip op * Fixing review comment * Fixing review comment: * Small change to restart PR_CI 7 years ago			`PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,`
			`"Beta2 power accumulator should have 1 dimension");`
Adding the Adam Optimizer operator (#4733) * add adam op moment1_out = beta1 * moment1 + (1 − beta1) * grad moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad moment1_hat = moment1_out / (1 - beta1^t) moment2_hat = moment2_out / (1 - beta2^t) param_out = param - learning_rate * moment1_hat / (sqrt(moment2_hat) + epsilon) * fix moment 2 * Adding the Adam optimization operator * Adding more tests for Adam op 7 years ago
			`auto param_dims = ctx->GetInputDim("Param");`
			`PADDLE_ENFORCE_EQ(`
			`param_dims, ctx->GetInputDim("Grad"),`
			`"Param and Grad input of AdamOp should have same dimension");`
			`PADDLE_ENFORCE_EQ(`
			`param_dims, ctx->GetInputDim("Moment1"),`
Adding the doc format for AdaDelta, AdaMax, Adam, AdaGrad, BatchNorm, Clip, Cast and AUC (#5317) * Adding the doc format for AdaDelta * Updating the documentation for Adagrad, Adam and Adamax * Updating the auc op * Fix review comments * Updating doc for Batch Norm * Updating the cast op * Updating the clip op * Fixing review comment * Fixing review comment: * Small change to restart PR_CI 7 years ago			`"Param and Moment1 input of AdamOp should have same dimension");`
Adding the Adam Optimizer operator (#4733) * add adam op moment1_out = beta1 * moment1 + (1 − beta1) * grad moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad moment1_hat = moment1_out / (1 - beta1^t) moment2_hat = moment2_out / (1 - beta2^t) param_out = param - learning_rate * moment1_hat / (sqrt(moment2_hat) + epsilon) * fix moment 2 * Adding the Adam optimization operator * Adding more tests for Adam op 7 years ago			`PADDLE_ENFORCE_EQ(`
			`param_dims, ctx->GetInputDim("Moment2"),`
Adding the doc format for AdaDelta, AdaMax, Adam, AdaGrad, BatchNorm, Clip, Cast and AUC (#5317) * Adding the doc format for AdaDelta * Updating the documentation for Adagrad, Adam and Adamax * Updating the auc op * Fix review comments * Updating doc for Batch Norm * Updating the cast op * Updating the clip op * Fixing review comment * Fixing review comment: * Small change to restart PR_CI 7 years ago			`"Param and Moment2 input of AdamOp should have same dimension");`
Adding the Adam Optimizer operator (#4733) * add adam op moment1_out = beta1 * moment1 + (1 − beta1) * grad moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad moment1_hat = moment1_out / (1 - beta1^t) moment2_hat = moment2_out / (1 - beta2^t) param_out = param - learning_rate * moment1_hat / (sqrt(moment2_hat) + epsilon) * fix moment 2 * Adding the Adam optimization operator * Adding more tests for Adam op 7 years ago
			`ctx->SetOutputDim("ParamOut", param_dims);`
			`ctx->SetOutputDim("Moment1Out", param_dims);`
			`ctx->SetOutputDim("Moment2Out", param_dims);`
			`}`
			`};`

			`class AdamOpMaker : public framework::OpProtoAndCheckerMaker {`
			`public:`
Move framework.proto to proto namespace (#6718) * Move framework.proto to proto namespace * Fix compile * Fix compile * Fix Compile 7 years ago			`AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker)`
Adding the Adam Optimizer operator (#4733) * add adam op moment1_out = beta1 * moment1 + (1 − beta1) * grad moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad moment1_hat = moment1_out / (1 - beta1^t) moment2_hat = moment2_out / (1 - beta2^t) param_out = param - learning_rate * moment1_hat / (sqrt(moment2_hat) + epsilon) * fix moment 2 * Adding the Adam optimization operator * Adding more tests for Adam op 7 years ago			`: OpProtoAndCheckerMaker(proto, op_checker) {`
			`AddInput("Param", "(Tensor) Input parameter");`
			`AddInput("Grad", "(Tensor) Input gradient");`
			`AddInput("LearningRate", "(Tensor) Learning rate");`
			`AddInput("Moment1", "(Tensor) Input first moment");`
			`AddInput("Moment2", "(Tensor) Input second moment");`
			`AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");`
			`AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");`

			`AddOutput("ParamOut", "(Tensor) Output parameter");`
			`AddOutput("Moment1Out", "(Tensor) Output first moment");`
			`AddOutput("Moment2Out", "(Tensor) Output second moment");`

			`AddAttr<float>("beta1",`
			`"(float, default 0.9) "`
			`"Exponential decay rate for the "`
			`"first moment estimates.")`
			`.SetDefault(0.9f);`
			`AddAttr<float>("beta2",`
			`"(float, default 0.999) "`
			`"exponential decay rate for the "`
			`"second moment estimates.")`
			`.SetDefault(0.999f);`
			`AddAttr<float>("epsilon",`
			`"(float, default 1.0e-8) "`
			`"Constant for numerical stability")`
			`.SetDefault(1.0e-8f);`

			`AddComment(R"DOC(`
Adding the doc format for AdaDelta, AdaMax, Adam, AdaGrad, BatchNorm, Clip, Cast and AUC (#5317) * Adding the doc format for AdaDelta * Updating the documentation for Adagrad, Adam and Adamax * Updating the auc op * Fix review comments * Updating doc for Batch Norm * Updating the cast op * Updating the clip op * Fixing review comment * Fixing review comment: * Small change to restart PR_CI 7 years ago			`Adam Optimizer.`
Adding the Adam Optimizer operator (#4733) * add adam op moment1_out = beta1 * moment1 + (1 − beta1) * grad moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad moment1_hat = moment1_out / (1 - beta1^t) moment2_hat = moment2_out / (1 - beta2^t) param_out = param - learning_rate * moment1_hat / (sqrt(moment2_hat) + epsilon) * fix moment 2 * Adding the Adam optimization operator * Adding more tests for Adam op 7 years ago
			`This implements the Adam optimizer from Section 2 of the Adam`
Adding the doc format for AdaDelta, AdaMax, Adam, AdaGrad, BatchNorm, Clip, Cast and AUC (#5317) * Adding the doc format for AdaDelta * Updating the documentation for Adagrad, Adam and Adamax * Updating the auc op * Fix review comments * Updating doc for Batch Norm * Updating the cast op * Updating the clip op * Fixing review comment * Fixing review comment: * Small change to restart PR_CI 7 years ago			`paper : https://arxiv.org/abs/1412.6980.`
			`Adam is a first-order gradient-based optimization method based on`
			`adaptive estimates of lower-order moments.`
Adding the Adam Optimizer operator (#4733) * add adam op moment1_out = beta1 * moment1 + (1 − beta1) * grad moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad moment1_hat = moment1_out / (1 - beta1^t) moment2_hat = moment2_out / (1 - beta2^t) param_out = param - learning_rate * moment1_hat / (sqrt(moment2_hat) + epsilon) * fix moment 2 * Adding the Adam optimization operator * Adding more tests for Adam op 7 years ago
			`Adam updates:`

Updating the Latex equation for Adagrad (#6009) * Updating the Latex equation for Adagrad * Fixing Latex euqations for adadelta, adam and adamax 7 years ago			`$$`
			`moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\`
			`moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\`
			`learning\_rate = learning\_rate *`
			`\frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\`
			`param\_out = param - learning\_rate * \frac{moment\_1\_out}{\sqrt{moment\_2\_out} + \epsilon}`
			`$$`
Adding the Adam Optimizer operator (#4733) * add adam op moment1_out = beta1 * moment1 + (1 − beta1) * grad moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad moment1_hat = moment1_out / (1 - beta1^t) moment2_hat = moment2_out / (1 - beta2^t) param_out = param - learning_rate * moment1_hat / (sqrt(moment2_hat) + epsilon) * fix moment 2 * Adding the Adam optimization operator * Adding more tests for Adam op 7 years ago
			`)DOC");`
			`}`
			`};`
			`} // namespace operators`
			`} // namespace paddle`

			`namespace ops = paddle::operators;`
			`REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);`
Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` 7 years ago			`REGISTER_OP_CPU_KERNEL(`
			`adam, ops::AdamOpKernel<paddle::platform::CPUDeviceContext, float>,`
			`ops::AdamOpKernel<paddle::platform::CPUDeviceContext, double>);`