add optimizer:dpsgd,test=develop (#19915)
parent 37f76407b0
commit 766bd529d1
@ -0,0 +1,107 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/optimizers/dpsgd_op.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

class DpsgdOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
                      "Input(Param) of DpsgdOp should not be null.");
    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
                      "Input(Grad) of DpsgdOp should not be null.");
    PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
                      "Input(LearningRate) of DpsgdOp should not be null.");
    PADDLE_ENFORCE_EQ(
        ctx->GetInputsVarType("Param").front(),
        framework::proto::VarType::LOD_TENSOR,
        "The input Var(%s)'s type should be LoDTensor, but the received is %s",
        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
    PADDLE_ENFORCE_EQ(
        ctx->GetInputsVarType("Grad").front(),
        framework::proto::VarType::LOD_TENSOR,
        "The input Var(%s)'s type should be LoDTensor, but the received is %s",
        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());

    PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
                      "Output(ParamOut) of DpsgdOp should not be null.");

    auto lr_dims = ctx->GetInputDim("LearningRate");
    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                      "Learning rate should have 1 element");
    auto param_dims = ctx->GetInputDim("Param");
    PADDLE_ENFORCE_EQ(
        param_dims, ctx->GetInputDim("Grad"),
        "Param and Grad input of DpsgdOp should have same dimension");

    ctx->SetOutputDim("ParamOut", param_dims);
  }

  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
                                   ctx.GetPlace());
  }
};

class DpsgdOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");
    AddInput("LearningRate", "(Tensor) Learning rate");

    AddOutput("ParamOut", "(Tensor) Output parameter");

AddAttr<float>("clip",
|
||||||
|
"(float, default 0.9) "
|
||||||
|
"Exponential decay rate for the "
|
||||||
|
"1st moment estimates.")
|
||||||
|
.SetDefault(10.0f);
|
||||||
|
AddAttr<float>("batch_size",
|
||||||
|
"(float, default 0.999) "
|
||||||
|
"exponential decay rate for the weighted "
|
||||||
|
"infinity norm estimates.")
|
||||||
|
.SetDefault(16.0f);
|
||||||
|
AddAttr<float>("sigma",
|
||||||
|
"(float, default 1.0e-8) "
|
||||||
|
"Constant for numerical stability")
|
||||||
|
.SetDefault(1.0f);
|
||||||
|
AddComment(R"DOC(
|
||||||
|
Dpsgd Optimizer.
|
||||||
|
|
||||||
|
We implement the Dpsgd optimizer according to CCS16 paper -
|
||||||
|
Deep Learning with Differential Privacy.
|
||||||
|
|
||||||
|
Dpsgd updates:
|
||||||
|
CCS16 - Deep Learning with Differential Privacy.
|
||||||
|
[https://arxiv.org/abs/1607.00133]
|
||||||
|
|
||||||
|
)DOC");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace operators
|
||||||
|
} // namespace paddle
|
||||||
|
|
||||||
|
namespace ops = paddle::operators;
|
||||||
|
REGISTER_OP_WITHOUT_GRADIENT(dpsgd, ops::DpsgdOp, ops::DpsgdOpMaker);
|
||||||
|
REGISTER_OP_CPU_KERNEL(
|
||||||
|
dpsgd, ops::DpsgdOpKernel<paddle::platform::CPUDeviceContext, float>,
|
||||||
|
ops::DpsgdOpKernel<paddle::platform::CPUDeviceContext, double>);
|
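Editorial note: in the notation of the attributes above, the update performed by the CPU kernel in dpsgd_op.h (next file) can be summarized as

\[
\begin{aligned}
s &= \max\!\left(1,\ \frac{\lVert g \rVert_2}{\mathrm{clip}}\right), \qquad
z \sim \mathcal{N}(0,\ \mathrm{sigma}^2)\ \ \text{(a single scalar draw per update)}, \\
\theta_{t+1} &= \theta_t - \mathrm{lr}\cdot\left(\frac{g}{s} + \frac{z}{\mathrm{batch\_size}}\right),
\end{aligned}
\]

where g is the gradient of the current lot. This is a summary of what the code computes, not necessarily the exact formulation used in the paper.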
@ -0,0 +1,114 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <math.h>
#include <stdlib.h>
#include <time.h>  // time(NULL) seeds the noise generator below
#include <iostream>
#include <random>  // std::minstd_rand, std::uniform_real_distribution
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class DpsgdOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *param_var = ctx.InputVar("Param");
    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
                      "The Var(%s)'s type should be LoDTensor, "
                      "but the received is %s",
                      ctx.Inputs("Param").front(),
                      framework::ToTypeName(param_var->Type()));

    const auto *grad_var = ctx.InputVar("Grad");
    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
                      "The Var(%s)'s type should be LoDTensor, "
                      "but the received is %s",
                      ctx.Inputs("Grad").front(),
                      framework::ToTypeName(grad_var->Type()));

    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");

    const auto *param = ctx.Input<framework::Tensor>("Param");
    const auto *grad = ctx.Input<framework::Tensor>("Grad");

    auto *param_out = ctx.Output<framework::Tensor>("ParamOut");

    auto sz = param_out->numel();
    PADDLE_ENFORCE_EQ(param->numel(), sz);
    PADDLE_ENFORCE_EQ(grad->numel(), sz);

    const T *lr = learning_rate->data<T>();
    const T *param_data = param->data<T>();
    const T *grad_data = grad->data<T>();

    T *out_data = param_out->mutable_data<T>(ctx.GetPlace());

    T clip = static_cast<T>(ctx.Attr<float>("clip"));
    T batch_size = static_cast<T>(ctx.Attr<float>("batch_size"));
    T sigma = static_cast<T>(ctx.Attr<float>("sigma"));
    // compute clipping
    float l2_norm = 0.0;
    for (int64_t i = 0; i < grad->numel(); ++i) {
      l2_norm = l2_norm + grad_data[i] * grad_data[i];
    }
    l2_norm = std::sqrt(l2_norm);

    float scale = 1.0;
    if (l2_norm > clip) {
      scale = l2_norm / clip;
    }

    // generate gaussian noise.
    // [https://en.wikipedia.org/wiki/Box-Muller_transform]
    float V1, V2, S;
    float X;
    float mu = 0.0;
    float U1, U2;
    unsigned seed = (unsigned int)(time(NULL));
    std::minstd_rand engine;
    engine.seed(seed);
    std::uniform_real_distribution<T> dist(0.0, 1.0);
    do {
      // srand((unsigned int)(time(NULL)));
      // U1 = (rand() * 1.0) / RAND_MAX;
      // U2 = (rand() * 1.0) / RAND_MAX;
      // U1 = rand_rr(&seed) * (1.0 / RAND_MAX);
      // U2 = rand_rr(&seed) * (1.0 / RAND_MAX);
      U1 = dist(engine);
      U2 = dist(engine);
      V1 = 2 * U1 - 1;
      V2 = 2 * U2 - 1;
      S = V1 * V1 + V2 * V2;
    } while (S >= 1 || S == 0);

    X = V1 * sqrt(-2 * log(S) / S);

    float gaussian_noise = mu + X * sigma;

    // update parameters
    for (int64_t i = 0; i < grad->numel(); ++i) {
      out_data[i] =
          param_data[i] -
          lr[0] * (grad_data[i] / scale + gaussian_noise / batch_size);
    }
    // CCS16 - Deep Learning with Differential Privacy.
    // [https://arxiv.org/abs/1607.00133]
  }  // Compute
};   // class
}  // namespace operators
}  // namespace paddle
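For reference, a minimal NumPy sketch of the update performed by DpsgdOpKernel::Compute above. The helper name dpsgd_update and its defaults are illustrative only and not part of the patch; the single scalar noise draw mirrors the one Box-Muller sample the kernel adds to every element.

import numpy as np


def dpsgd_update(param, grad, lr, clip=10.0, batch_size=16.0, sigma=1.0,
                 rng=np.random):
    """NumPy mirror of DpsgdOpKernel::Compute (editorial sketch, not the op)."""
    # Scale the gradient so that its L2 norm does not exceed `clip`.
    l2_norm = np.sqrt(np.sum(grad * grad))
    scale = l2_norm / clip if l2_norm > clip else 1.0
    # The kernel adds one scalar Gaussian sample, averaged over the lot size.
    noise = rng.normal(loc=0.0, scale=sigma)
    return param - lr * (grad / scale + noise / batch_size)


# Example: new_param = dpsgd_update(param, grad, lr=0.001, sigma=0.0) gives a
# plain SGD step, which is the case exercised by the unit test below.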
@ -0,0 +1,73 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest


class TestDpsgdOp(OpTest):
    def setUp(self):
        '''Test Dpsgd Operator with supplied attributes
        '''
        self.op_type = "dpsgd"
        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")

        # sigma = 0 disables the noise and the clip threshold is far above the
        # gradient's L2 norm, so the op reduces to plain SGD and the reference
        # result can be computed deterministically in dpsgd_step below.
        learning_rate = 0.001
        clip = 10000.0
        batch_size = 16.0
        sigma = 0.0

        self.inputs = {
            'Param': param,
            'Grad': grad,
            'LearningRate': np.array([learning_rate]).astype("float32")
        }

        self.attrs = {'clip': clip, 'batch_size': batch_size, 'sigma': sigma}

        param_out = dpsgd_step(self.inputs, self.attrs)

        self.outputs = {'ParamOut': param_out}

    def test_check_output(self):
        self.check_output()
def dpsgd_step(inputs, attributes):
    '''
    Simulate one step of the dpsgd optimizer
    :param inputs: dict of inputs
    :param attributes: dict of attributes
    :return: the updated parameter; with sigma = 0 and no clipping this is
             a plain SGD step
    '''
    param = inputs['Param']
    grad = inputs['Grad']
    lr = inputs['LearningRate']

    # clip, batch_size and sigma are read for completeness; they do not affect
    # the result because the test uses sigma = 0 and a clip threshold larger
    # than the gradient norm.
    clip = attributes['clip']
    batch_size = attributes['batch_size']
    sigma = attributes['sigma']

    param_out = param - lr * grad

    return param_out

if __name__ == "__main__":
    unittest.main()