refactor momentum op to combine weight (#27414)

* refactor momentum op to combine weight_decay (scale op and sum op)
furnace authored, committed by GitHub
parent bd1d6d3b30
commit 8ff3550658
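In effect, this change folds the weight-decay ops that append_regularization_ops previously emitted for L2 decay (a scale op plus a sum op, as the commit message says) into the momentum op itself. A rough before/after sketch of the per-parameter update ops, illustrative only and not actual Paddle program IR:

# Illustrative only: which ops implement one parameter update with L2 weight decay.
ops_before = [
    ("scale", {"scale": "coeff"}),      # coeff * param        (weight decay)
    ("sum", {}),                        # grad + coeff * param (weight decay)
    ("momentum", {"mu": "mu"}),         # velocity / param update
]
ops_after = [
    ("momentum", {"mu": "mu",
                  "regularization_method": "l2_decay",
                  "regularization_coeff": "coeff"}),  # decay fused into the op
]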

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace operators {
@@ -61,6 +62,12 @@ void MomentumOpMaker::Make() {
                "(bool, default false) "
                "Use Nesterov Momentum")
      .SetDefault(false);
  AddAttr<std::string>(
      "regularization_method",
      "(string) regularization_method, right now only support l2decay or none")
      .SetDefault("");
  AddAttr<float>("regularization_coeff", "(float) regularization_coeff")
      .SetDefault(0);
  AddComment(R"DOC(
Momentum Optimizer.
@@ -90,3 +97,16 @@ REGISTER_OPERATOR(
REGISTER_OP_CPU_KERNEL(
    momentum, ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::MomentumOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_VERSION(momentum)
    .AddCheckpoint(
        R"ROC(
      Upgrade momentum add 2 attributes [regularization_method, regularization_coeff].
    )ROC",
        paddle::framework::compatible::OpVersionDesc()
            .NewAttr("regularization_method",
                     "(string) regularization_method, right now only support "
                     "l2decay or none",
                     std::string(""))
            .NewAttr("regularization_coeff", "(float) regularization_coeff",
                     0.0f));
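The two new attributes default to an empty method string and 0.0f, so programs saved before this checkpoint keep their previous behavior once the defaults are filled in. Note the C++ description spells the method "l2decay" while the Python wrapper added below passes "l2_decay". For reference, a minimal NumPy-style sketch (not the actual CPU/CUDA kernel) of the update these attributes control, following the equations in the Python docstring further down:

def momentum_update(param, grad, velocity, lr, mu,
                    use_nesterov=False, method="", coeff=0.0):
    # Sketch only; accept either method spelling seen in this diff.
    if method in ("l2decay", "l2_decay"):
        grad = grad + coeff * param          # fused L2 weight decay
    velocity_out = mu * velocity + grad      # velocity update
    if use_nesterov:
        param_out = param - (grad + mu * velocity_out) * lr
    else:
        param_out = param - lr * velocity_out
    return param_out, velocity_out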

File diff suppressed because it is too large.

@@ -35,6 +35,7 @@ from . import mixed_precision
from .mixed_precision import *
from . import layers
from .layers import *
from . import optimizer
__all__ = []
__all__ += decoder.__all__
@@ -46,3 +47,4 @@ __all__ += utils.__all__
__all__ += extend_optimizer.__all__
__all__ += ['mixed_precision']
__all__ += layers.__all__
__all__ += optimizer.__all__
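With this export in place, the new wrapper is reachable from the contrib namespace. A minimal usage sketch (the full static-graph example is in the class docstring below):

import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib.optimizer import Momentum  # direct submodule import

paddle.enable_static()
# Same class through the contrib namespace, as used in the docstring example:
opt = fluid.contrib.optimizer.Momentum(learning_rate=0.001, momentum=0.9)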

@@ -0,0 +1,175 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.optimizer import Optimizer
from paddle.fluid.regularizer import L1DecayRegularizer
from paddle.fluid.regularizer import L2DecayRegularizer
from paddle.fluid.regularizer import append_regularization_ops
from paddle.fluid import framework
from paddle.fluid import core
from paddle.fluid.framework import program_guard
from paddle.fluid.clip import append_gradient_clip_ops
__all__ = ['Momentum']
class Momentum(Optimizer):
    """
    Simple Momentum optimizer with velocity state

    This optimizer has a flag for Nesterov Momentum.

    The update equations are as follows:

    .. math::

        & velocity = mu * velocity + gradient

        & if (use\_nesterov):

        &\quad param = param - (gradient + mu * velocity) * learning\_rate

        & else:

        &\quad param = param - learning\_rate * velocity

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        momentum (float): Momentum factor
        parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        use_nesterov (bool, optional): Enables Nesterov momentum, default is false.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two methods: \
            :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid
            import numpy as np

            paddle.enable_static()
            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = paddle.static.data(name='x', shape=[1, 13], dtype='float32')
                y = paddle.static.data(name='y', shape=[1], dtype='float32')
                linear = paddle.nn.Linear(13, 1)
                y_predict = linear(x)
                cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
                avg_cost = paddle.mean(cost)

                moment_optimizer = fluid.contrib.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
                moment_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(paddle.static.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
    """
    _velocity_acc_str = "velocity"

    def __init__(self,
                 learning_rate,
                 momentum,
                 parameter_list=None,
                 use_nesterov=False,
                 regularization=None,
                 grad_clip=None,
                 name=None):
        assert learning_rate is not None
        assert momentum is not None
        predicate = lambda regular: isinstance(regular, L2DecayRegularizer)
        py_regular = None if predicate(regularization) else regularization
        super(Momentum, self).__init__(
            learning_rate=learning_rate,
            parameter_list=parameter_list,
            regularization=py_regular,
            grad_clip=grad_clip,
            name=name)
        self.type = "momentum"
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)
        self._regularization_method = ""
        self._regularization_coeff = 0
        if isinstance(regularization, L2DecayRegularizer):
            self._regularization_method = "l2_decay"
            self._regularization_coeff = regularization._regularization_coeff
    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        for p in parameters:
            self._add_accumulator(self._velocity_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
        lr = self._create_param_lr(param_and_grad)

        if framework.in_dygraph_mode():
            _, _ = core.ops.momentum(
                param_and_grad[0], param_and_grad[1], velocity_acc, lr,
                param_and_grad[0], velocity_acc, 'mu', self._momentum,
                'use_nesterov', self._use_nesterov, 'regularization_method',
                self._regularization_method, 'regularization_coeff',
                self._regularization_coeff)
            return None

        attrs = {
            "mu": self._momentum,
            "use_nesterov": self._use_nesterov,
            "regularization_method": self._regularization_method,
            "regularization_coeff": self._regularization_coeff
        }
        inputs = {
            "Param": [param_and_grad[0]],
            "Grad": [param_and_grad[1]],
            "Velocity": [velocity_acc],
            "LearningRate": [lr]
        }
        outputs = {
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc]
        }
        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True)

        return momentum_op
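As a sketch of how the constructor above routes regularization: an L2DecayRegularizer is withheld from the base Optimizer (py_regular is None) and encoded as op attributes, so the decay runs inside the momentum kernel, while any other regularizer such as L1Decay still goes through the base class path. An illustrative check, assuming static mode and the import path added in this PR:

import paddle
from paddle.fluid.contrib.optimizer import Momentum
from paddle.fluid.regularizer import L1DecayRegularizer, L2DecayRegularizer

paddle.enable_static()

# L2 decay is fused: it shows up as op attributes, not as extra ops.
opt_l2 = Momentum(learning_rate=0.001, momentum=0.9,
                  regularization=L2DecayRegularizer(regularization_coeff=0.1))
assert opt_l2._regularization_method == "l2_decay"
assert opt_l2._regularization_coeff == 0.1

# L1 decay is not fused; it is handled by the base Optimizer as before.
opt_l1 = Momentum(learning_rate=0.001, momentum=0.9,
                  regularization=L1DecayRegularizer(regularization_coeff=0.1))
assert opt_l1._regularization_method == ""
assert opt_l1._regularization_coeff == 0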

File diff suppressed because it is too large.