@@ -24,7 +24,7 @@ from . import framework
 from . import layers
 from . import unique_name
 from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
-from .clip import append_gradient_clip_ops, error_clip_callback
+from .clip import GradientClipBase, error_clip_callback, append_gradient_clip_ops
 from .framework import program_guard
 from .initializer import Constant
 from .layer_helper import LayerHelper
@@ -109,6 +109,8 @@ class Optimizer(object):
         self._opti_name_list = []
         self._accumulators_holder = {}
         self._param_device_map = dict()
+        # if pass grad_clip into minimize, it will not be None
+        self._grad_clip = None
 
     @framework.dygraph_only
     def state_dict(self):
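
The attribute added above stays None unless `minimize` receives a `grad_clip` object, so existing callers keep the old clipping path. A minimal sketch of that default, assuming `fluid.optimizer.SGD` as a concrete optimizer (not shown in this diff):

.. code-block:: python

    import paddle.fluid as fluid

    sgd = fluid.optimizer.SGD(learning_rate=0.01)
    # The attribute introduced by this patch is unset until
    # minimize(..., grad_clip=...) is called.
    assert sgd._grad_clip is None
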
@@ -690,12 +692,17 @@ class Optimizer(object):
                 # ...
                 optimizer.apply_gradients(params_grads)
         """
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
 
         params_grads, table_param_and_grad, table_optimize_op = \
             self._process_distribute_lookuptable(params_grads)
 
-        params_grads = append_gradient_clip_ops(params_grads)
+        # 'minimize(grad_clip)' or 'set_gradient_clip'
+        if self._grad_clip is not None:
+            params_grads = self._grad_clip(params_grads)
+        else:
+            params_grads = append_gradient_clip_ops(params_grads)
 
         # Add regularization if any
         params_grads = append_regularization_ops(params_grads,
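
With this branch, a clip object passed to `minimize(grad_clip=...)` takes precedence over the program-level `set_gradient_clip` mechanism handled by `append_gradient_clip_ops`. A hedged usage sketch; `fluid.layers.data`, `fluid.layers.fc`, and `fluid.clip.GradientClipByGlobalNorm` are assumed helpers not shown in this diff, and the clip class is assumed to derive from `GradientClipBase`:

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(
        fluid.layers.square_error_cost(input=pred, label=y))

    # Preferred path: the clip object is stored as self._grad_clip and applied
    # in apply_gradients, before regularization.
    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss, grad_clip=clip)

    # Fallback path: with no grad_clip argument, apply_gradients still calls
    # append_gradient_clip_ops, which honors fluid.clip.set_gradient_clip(...).
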
@@ -712,19 +719,19 @@ class Optimizer(object):
         """
         Second part of `minimize`, appending optimization operators for
         given `params_grads` pairs.
 
         Args:
             loss (Variable): loss variable to run optimizations.
             startup_program (Program): startup_program for initializing parameters
                 in `parameter_list`.
             params_grads (list): list of (param, grad) pair to do optimization.
 
         Returns:
             list: A list of operators appended to the current program.
         """
         if framework.in_dygraph_mode():
             with program_guard(framework.default_main_program(),
                                framework.default_startup_program()):
+                if self._grad_clip is not None:
+                    params_grads = self._grad_clip(params_grads)
                 params_grads = append_regularization_ops(params_grads,
                                                          self.regularization)
                 optimize_ops = self._create_optimization_pass(params_grads)
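
In the dygraph branch above, clipping now runs under `program_guard` right before `append_regularization_ops`. A hedged imperative-mode sketch; `fluid.dygraph.Linear`, `to_variable`, and `GradientClipByGlobalNorm` are assumed names from roughly this release and are not part of the diff:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        linear = fluid.dygraph.Linear(10, 1)
        sgd = fluid.optimizer.SGD(learning_rate=0.01,
                                  parameter_list=linear.parameters())
        inp = fluid.dygraph.to_variable(
            np.random.rand(4, 10).astype('float32'))
        loss = fluid.layers.reduce_mean(linear(inp))
        loss.backward()
        # The clip passed here is applied in the dygraph branch shown above,
        # ahead of regularization.
        sgd.minimize(loss,
                     grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
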
@@ -809,16 +816,19 @@ class Optimizer(object):
             Please refer to the example of current Optimizer.
         """
         assert isinstance(loss, Variable), "The loss should be an Variable."
+        if grad_clip is not None:
+            if not isinstance(grad_clip, GradientClipBase):
+                raise TypeError(
+                    "'grad_clip' should be an instance of GradientClipBase's derived class"
+                )
+            self._grad_clip = grad_clip
 
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
             parameter_list=parameter_list,
             no_grad_set=no_grad_set)
-
-        if grad_clip is not None and framework.in_dygraph_mode():
-            # TODO(hongyu): FIX later, this is only for dygraph, should be work for static mode
-            params_grads = grad_clip(params_grads)
 
         optimize_ops = self.apply_optimize(
             loss, startup_program=startup_program, params_grads=params_grads)
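
The validation added to `minimize` above rejects clip arguments that are not `GradientClipBase` instances before any graph work happens. A small sketch of that failure mode (the lambda is a deliberately invalid stand-in, and the network setup uses assumed `fluid.layers` helpers):

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))

    optimizer = fluid.optimizer.SGD(learning_rate=0.01)
    try:
        # A plain callable is not a GradientClipBase subclass, so minimize raises.
        optimizer.minimize(loss, grad_clip=lambda params_grads: params_grads)
    except TypeError as e:
        print(e)  # 'grad_clip' should be an instance of GradientClipBase's derived class
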
@@ -1148,6 +1158,7 @@ class DGCMomentumOptimizer(Optimizer):
 
         self.regular_type, self.regular_coeff = self._get_regularization_param(
             self.regularization)
+        self._grad_clip = None
 
     def _get_regularization_param(self, regularization):
         regular_type = 0
@@ -1404,24 +1415,28 @@ class DGCMomentumOptimizer(Optimizer):
             dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
                              [param_var.name, grad_var.name])
 
+    @imperative_base.no_grad
     def apply_gradients(self, params_grads):
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
 
         params_grads, table_param_and_grad, table_optimize_op = \
             self._process_distribute_lookuptable(params_grads)
 
         not_dgc_params_grads = []
         dgc_params_grads = []
+        # DGC clip and regularization in optimizer.backward
         for param, grad in params_grads:
             if not self._is_use_dgc(param, grad):
                 not_dgc_params_grads.append((param, grad))
             else:
                 dgc_params_grads.append((param, grad))
 
-        # DGC clip and regularization in local
-        not_dgc_params_grads = append_gradient_clip_ops(not_dgc_params_grads)
+        # 'minimize(grad_clip)' or 'set_gradient_clip'
+        if self._grad_clip is not None:
+            not_dgc_params_grads = self._grad_clip(not_dgc_params_grads)
+        else:
+            not_dgc_params_grads = append_gradient_clip_ops(
+                not_dgc_params_grads)
 
-        # Add regularization if any
         not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
                                                          self.regularization)
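
The loop above partitions `params_grads` so that only the non-DGC pairs go through the local clip/regularization path; DGC pairs are handled in `optimizer.backward`. A framework-free sketch of the same partition pattern, with made-up parameter names purely for illustration:

.. code-block:: python

    def split_params_grads(params_grads, is_use_dgc):
        # Mirrors the partition in DGCMomentumOptimizer.apply_gradients.
        not_dgc, dgc = [], []
        for param, grad in params_grads:
            (dgc if is_use_dgc(param, grad) else not_dgc).append((param, grad))
        return not_dgc, dgc

    pairs = [('fc_w', 'fc_w@GRAD'), ('fc_b', 'fc_b@GRAD')]
    not_dgc, dgc = split_params_grads(pairs, lambda p, g: p.endswith('_w'))
    print(not_dgc)  # [('fc_b', 'fc_b@GRAD')]
    print(dgc)      # [('fc_w', 'fc_w@GRAD')]
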
@@ -3942,16 +3957,13 @@ class RecomputeOptimizer(Optimizer):
     def apply_optimize(self, loss, startup_program, params_grads):
         """
        call the apply_optimize function of self._optimizer
 
        Args:
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            params_grads (list): list of (param, grad) pair to do optimization.
 
        Examples:
            .. code-block:: python
 
                import paddle.fluid as fluid
 
                def mlp(input_x, input_y, hid_dim=128, label_dim=2):
@@ -3979,7 +3991,6 @@ class RecomputeOptimizer(Optimizer):
                    cost, startup_program=None, params_grads=params_grads)
 
                print("Finished apply_optimize")
 
        """
 
        return self._optimizer.apply_optimize(
@@ -3991,24 +4002,24 @@ class RecomputeOptimizer(Optimizer):
                  parameter_list=None,
                  no_grad_set=None,
                  grad_clip=None):
-        assert isinstance(loss, Variable), "The loss should be an Variable."
+        assert (isinstance(loss, Variable)), "The loss should be an Variable."
         assert (self._checkpoints is not None
                 ), "You should call _set_checkpoints first"
         if framework.in_dygraph_mode():
             raise NotImplementedError(
                 "DyGraph current does not support recompute")
+        if grad_clip is not None:
+            if not isinstance(grad_clip, GradientClipBase):
+                raise TypeError(
+                    "'grad_clip' should be an instance of GradientClipBase's derived class"
+                )
+            self._optimizer._grad_clip = grad_clip
 
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
             parameter_list=parameter_list,
             no_grad_set=no_grad_set)
-
-        if grad_clip:
-            # TODO(guru4elephant): should add grad_clip for static graph
-            pass
 
         optimize_ops = self.apply_optimize(
             loss, startup_program=startup_program, params_grads=params_grads)
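
`RecomputeOptimizer.minimize` now performs the same type check and forwards the clip object to the wrapped optimizer via `self._optimizer._grad_clip`. A hedged end-to-end sketch; the `mlp` helper follows the docstring example referenced in this file, and `GradientClipByGlobalNorm` is an assumed `GradientClipBase` subclass:

.. code-block:: python

    import paddle.fluid as fluid

    def mlp(input_x, input_y, hid_dim=128, label_dim=2):
        fc_1 = fluid.layers.fc(input=input_x, size=hid_dim)
        prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax')
        cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
        return fluid.layers.mean(x=cost), fc_1

    input_x = fluid.layers.data(name='x', shape=[32], dtype='float32')
    input_y = fluid.layers.data(name='y', shape=[1], dtype='int64')
    cost, fc_1 = mlp(input_x, input_y)

    optimizer = fluid.optimizer.RecomputeOptimizer(
        fluid.optimizer.Adam(learning_rate=0.01))
    optimizer._set_checkpoints([fc_1])
    # grad_clip is validated here and then handed to the wrapped Adam optimizer.
    optimizer.minimize(cost,
                       grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))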