add new method of gradient_clip, better to use, test=develop (#23224)

revert-23830-2.0-beta
Zhou Wei 5 years ago committed by GitHub
parent b7b0b3595b
commit 7fda333ac1
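Taken together, these changes route gradient clipping through a single grad_clip argument on Optimizer.minimize, backed by the GradientClipBase classes in fluid.clip, and retire both the fluid.dygraph_grad_clip module and the gradient_clip field of ParamAttr. A minimal static-graph sketch of the new entry point, assuming a toy regression network (layer sizes and names are illustrative, not taken from this diff):

import paddle.fluid as fluid

# A small network; any program with trainable parameters would do.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
avg_cost = fluid.layers.mean(
    fluid.layers.square_error_cost(input=y_predict, label=y))

# New style: build a clip object from fluid.clip and hand it to minimize.
clip = fluid.clip.GradientClipByGlobalNorm(1.0)
sgd = fluid.optimizer.SGD(learning_rate=0.1)
sgd.minimize(avg_cost, grad_clip=clip)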

@ -75,7 +75,6 @@ from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip
from . import dygraph_grad_clip
from . import profiler
from . import unique_name
from . import parallel_executor
@ -122,7 +121,6 @@ __all__ = framework.__all__ + executor.__all__ + \
'WeightNormParamAttr',
'DataFeeder',
'clip',
'dygraph_grad_clip',
'profiler',
'unique_name',
'Scope',

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -2409,7 +2409,6 @@ class Block(object):
trainable = v.trainable
optimize_attr = v.optimize_attr
regularizer = v.regularizer
gradient_clip_attr = v.gradient_clip_attr
error_clip = v.error_clip
elif type(v) == Variable:
var_type = "Variable"
@ -2432,7 +2431,6 @@ class Block(object):
trainable=trainable,
optimize_attr=optimize_attr,
regularizer=regularizer,
gradient_clip_attr=gradient_clip_attr,
error_clip=error_clip)
else:
var = Parameter(
@ -2445,7 +2443,6 @@ class Block(object):
trainable=trainable,
optimize_attr=optimize_attr,
regularizer=regularizer,
gradient_clip_attr=gradient_clip_attr,
error_clip=error_clip)
elif var_type == "Variable":
var = Variable(
@ -2723,7 +2720,6 @@ class Block(object):
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name)
else:
@ -2737,7 +2733,6 @@ class Block(object):
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name)
self.vars[new_p.name] = new_p
@ -4646,8 +4641,6 @@ class Parameter(Variable):
Default: {'learning_rate': 1.0}
regularizer(WeightDecayRegularizer): The Regularizer which will
be applied on the parameter. Default: None
gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy
which will be applied on the parameter. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this parameter.
"""
@ -4687,8 +4680,6 @@ class Parameter(Variable):
self.regularizer = kwargs.get('regularizer', None)
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
self.do_model_average = kwargs.get('do_model_average', None)
self.is_distributed = False
@ -4723,7 +4714,7 @@ class Parameter(Variable):
if with_details:
res_str = Variable.to_string(self, throw_on_error, True)
additional_attr = ("trainable", "optimize_attr", "regularizer",
"gradient_clip_attr", "do_model_average")
"do_model_average")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
cpt.to_text(getattr(self, attr_name)))
@ -4752,8 +4743,6 @@ class ParamBase(core.VarBase):
Default: {'learning_rate': 1.0}
regularizer(WeightDecayRegularizer): The Regularizer which will
be applied on the ParamBase. Default: None
gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy
which will be applied on the ParamBase. Default: None
do_model_average(bool): True if the model average strategy will
be applied on this ParamBase.
"""
@ -4792,8 +4781,6 @@ class ParamBase(core.VarBase):
self.regularizer = kwargs.get('regularizer', None)
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
self.do_model_average = kwargs.get('do_model_average', None)
self.is_distributed = False

@ -24,7 +24,7 @@ from . import framework
from . import layers
from . import unique_name
from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from .clip import append_gradient_clip_ops, error_clip_callback
from .clip import GradientClipBase, error_clip_callback, append_gradient_clip_ops
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
@ -109,6 +109,8 @@ class Optimizer(object):
self._opti_name_list = []
self._accumulators_holder = {}
self._param_device_map = dict()
# if grad_clip is passed into minimize, it will not be None
self._grad_clip = None
@framework.dygraph_only
def state_dict(self):
@ -690,12 +692,17 @@ class Optimizer(object):
# ...
optimizer.apply_gradients(params_grads)
"""
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads)
params_grads = append_gradient_clip_ops(params_grads)
# 'minimize(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
else:
params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any
params_grads = append_regularization_ops(params_grads,
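The branch just above gives a grad_clip object passed to minimize priority over a strategy registered with fluid.clip.set_gradient_clip: append_gradient_clip_ops (which honors set_gradient_clip) only runs when self._grad_clip is still None. A hedged sketch of the two entry points side by side (set_gradient_clip is assumed to take the clip object as its single required argument, as in existing fluid usage):

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))

# Old entry point: register a program-level clip strategy.
fluid.clip.set_gradient_clip(fluid.clip.GradientClipByNorm(1.0))

# New entry point: pass a clip object to minimize. Because self._grad_clip is
# then not None, apply_gradients applies it and skips append_gradient_clip_ops,
# so the strategy registered above is ignored for this optimizer.
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
optimizer.minimize(loss, grad_clip=fluid.clip.GradientClipByGlobalNorm(1.0))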
@ -712,19 +719,19 @@ class Optimizer(object):
"""
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
params_grads (list): list of (param, grad) pair to do optimization.
Returns:
list: A list of operators appended to the current program.
"""
if framework.in_dygraph_mode():
with program_guard(framework.default_main_program(),
framework.default_startup_program()):
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
params_grads = append_regularization_ops(params_grads,
self.regularization)
optimize_ops = self._create_optimization_pass(params_grads)
@ -809,16 +816,19 @@ class Optimizer(object):
Please refer to the example of current Optimizer.
"""
assert isinstance(loss, Variable), "The loss should be an Variable."
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self._grad_clip = grad_clip
params_grads = self.backward(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set)
if grad_clip is not None and framework.in_dygraph_mode():
# TODO(hongyu): FIX later, this is only for dygraph, should be work for static mode
params_grads = grad_clip(params_grads)
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
@ -1148,6 +1158,7 @@ class DGCMomentumOptimizer(Optimizer):
self.regular_type, self.regular_coeff = self._get_regularization_param(
self.regularization)
self._grad_clip = None
def _get_regularization_param(self, regularization):
regular_type = 0
@ -1404,24 +1415,28 @@ class DGCMomentumOptimizer(Optimizer):
dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
[param_var.name, grad_var.name])
@imperative_base.no_grad
def apply_gradients(self, params_grads):
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads, table_param_and_grad, table_optimize_op = \
self._process_distribute_lookuptable(params_grads)
not_dgc_params_grads = []
dgc_params_grads = []
# DGC clip and regularization in optimizer.backward
for param, grad in params_grads:
if not self._is_use_dgc(param, grad):
not_dgc_params_grads.append((param, grad))
else:
dgc_params_grads.append((param, grad))
# DGC clip and regularization in local
not_dgc_params_grads = append_gradient_clip_ops(not_dgc_params_grads)
# 'minimize(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
not_dgc_params_grads = self._grad_clip(not_dgc_params_grads)
else:
not_dgc_params_grads = append_gradient_clip_ops(
not_dgc_params_grads)
# Add regularization if any
not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
self.regularization)
@ -3942,16 +3957,13 @@ class RecomputeOptimizer(Optimizer):
def apply_optimize(self, loss, startup_program, params_grads):
"""
call the apply_optimize function of self._optimizer
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
params_grads (list): list of (param, grad) pair to do optimization.
Examples:
.. code-block:: python
import paddle.fluid as fluid
def mlp(input_x, input_y, hid_dim=128, label_dim=2):
@ -3979,7 +3991,6 @@ class RecomputeOptimizer(Optimizer):
cost, startup_program=None, params_grads=params_grads)
print("Finished apply_optimize")
"""
return self._optimizer.apply_optimize(
@ -3991,24 +4002,24 @@ class RecomputeOptimizer(Optimizer):
parameter_list=None,
no_grad_set=None,
grad_clip=None):
assert (isinstance(loss, Variable)), "The loss should be an Variable."
assert isinstance(loss, Variable), "The loss should be an Variable."
assert (self._checkpoints is not None
), "You should call _set_checkpoints first"
if framework.in_dygraph_mode():
raise NotImplementedError(
"DyGraph current does not support recompute")
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
self._optimizer._grad_clip = grad_clip
params_grads = self.backward(
loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set)
if grad_clip:
# TODO(guru4elephant): should add grad_clip for static graph
pass
optimize_ops = self.apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads)
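RecomputeOptimizer.minimize now performs the same GradientClipBase type check and forwards grad_clip to the wrapped optimizer through self._optimizer._grad_clip, replacing the old TODO/pass branch. A rough static-graph usage sketch, assuming the mlp-style setup from the apply_optimize docstring above (layer sizes, names, and the checkpoint choice are illustrative):

import paddle.fluid as fluid

input_x = fluid.layers.data(name='x', shape=[32], dtype='float32')
input_y = fluid.layers.data(name='y', shape=[1], dtype='int64')
fc_1 = fluid.layers.fc(input=input_x, size=128, act='relu')
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
loss = fluid.layers.mean(
    fluid.layers.cross_entropy(input=prediction, label=input_y))

sgd = fluid.optimizer.SGD(learning_rate=0.01)
recompute = fluid.optimizer.RecomputeOptimizer(sgd)
recompute._set_checkpoints([fc_1])

# grad_clip is type-checked here and then applied by the inner SGD optimizer.
recompute.minimize(loss, grad_clip=fluid.clip.GradientClipByGlobalNorm(1.0))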

@ -15,6 +15,7 @@
from __future__ import print_function
import six
import warnings
from .initializer import Initializer, Xavier, Constant
from .regularizer import WeightDecayRegularizer
@ -68,7 +69,6 @@ class ParamAttr(object):
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None,
do_model_average=True):
self.name = name
if isinstance(self.name, six.string_types) and self.name == "":
@ -78,7 +78,6 @@ class ParamAttr(object):
self.learning_rate = learning_rate
self.regularizer = regularizer
self.trainable = trainable
self.gradient_clip = gradient_clip
self.do_model_average = do_model_average
def _set_default_initializer(self, initializer):
@ -176,7 +175,6 @@ class ParamAttr(object):
},
'regularizer': self.regularizer,
'trainable': self.trainable,
'gradient_clip_attr': self.gradient_clip,
'do_model_average': self.do_model_average
}
if with_initializer:
@ -248,7 +246,6 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=1.0,
regularizer=None,
trainable=True,
gradient_clip=None,
do_model_average=False):
super(WeightNormParamAttr, self).__init__(
name=name,
@ -256,6 +253,5 @@ class WeightNormParamAttr(ParamAttr):
learning_rate=learning_rate,
regularizer=regularizer,
trainable=trainable,
gradient_clip=gradient_clip,
do_model_average=do_model_average)
self.dim = dim

@ -476,15 +476,18 @@ class TestL2Decay(TranspilerTest):
size=1000,
act=None,
param_attr=fluid.ParamAttr(
name='fc_w',
regularizer=fluid.regularizer.L2Decay(),
gradient_clip=fluid.clip.GradientClipByValue(0.1)),
name='fc_w', regularizer=fluid.regularizer.L2Decay()),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
sgd_optimizer.minimize(avg_cost)
def filter(param):
return param.name == "fc_w"
clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter)
sgd_optimizer.minimize(avg_cost, grad_clip=clip)
def transpiler_test_impl(self):
pserver, startup = self.get_pserver(self.pserver1_ep)
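The test change above also shows what replaces the removed ParamAttr(gradient_clip=...) field: which parameters are clipped is now decided by the clip object itself through its need_clip callable. A small self-contained sketch of that pattern (the parameter names mirror the test; the network itself is illustrative):

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[8], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(
    input=x, size=1,
    param_attr=fluid.ParamAttr(name='fc_w'),
    bias_attr=fluid.ParamAttr(name='fc_b'))
loss = fluid.layers.mean(
    fluid.layers.square_error_cost(input=y_predict, label=y))

# Only parameters for which need_clip returns True have their gradients
# clipped; here the filter keeps just the fc weight, as in the test above.
def only_fc_w(param):
    return param.name == 'fc_w'

clip = fluid.clip.GradientClipByValue(0.1, need_clip=only_fc_w)
fluid.optimizer.SGD(learning_rate=0.1).minimize(loss, grad_clip=clip)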

@ -25,7 +25,7 @@ from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph_grad_clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
from paddle.fluid.clip import GradientClipByValue, GradientClipByNorm, GradientClipByGlobalNorm
class TestGradClipByGlobalNorm(unittest.TestCase):
@ -65,7 +65,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase):
def get_dygrap_global_norm_result(self):
with fluid.dygraph.guard():
gloabl_norm_clip = GradClipByGlobalNorm(self.max_global_norm)
gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
@ -135,7 +135,7 @@ class TestGradClipByNorm(unittest.TestCase):
def get_dygrap_norm_result(self):
with fluid.dygraph.guard():
norm_clip = GradClipByNorm(self.max_norm)
norm_clip = GradientClipByNorm(self.max_norm)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
@ -200,8 +200,8 @@ class TestGradClipByValue(unittest.TestCase):
def get_dygrap_clip_result(self):
with fluid.dygraph.guard():
value_clip = GradClipByValue(self.min_value, self.max_value)
value_clip = GradientClipByValue(
max=self.max_value, min=self.min_value)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
@ -225,7 +225,7 @@ class TestGradClipByValue(unittest.TestCase):
for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
def test_clip_by_norm_2(self):
def test_clip_by_value_2(self):
self.init_value()
self.init_scale = 0.2
@ -236,7 +236,7 @@ class TestGradClipByValue(unittest.TestCase):
for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))
def test_clip_by_norm_3(self):
def test_clip_by_value_3(self):
self.init_value()
self.init_scale = 0.5

File diff suppressed because it is too large

@ -331,7 +331,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
model = MyLayer(size, vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
embed = fluid.dygraph.to_variable(embed)
@ -350,7 +350,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
model = MyLayer2(size, vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
emebd = fluid.dygraph.to_variable(embed)

@ -49,7 +49,7 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient
# grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
# grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word)
@ -83,8 +83,7 @@ class TestSimpleNet(unittest.TestCase):
with fluid.dygraph.guard(place):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = sort_sum_gradient
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
5.0)
grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word)
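In dygraph mode the same classes from fluid.clip replace the removed fluid.dygraph_grad_clip module, and the clip object is again handed to minimize(grad_clip=...). A minimal dygraph sketch, assuming a trivial fluid.dygraph.Linear model (sizes and data are illustrative):

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    model = fluid.dygraph.Linear(4, 2)
    optimizer = fluid.optimizer.AdamOptimizer(
        0.001, parameter_list=model.parameters())
    grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)

    x = fluid.dygraph.to_variable(
        np.random.random((3, 4)).astype('float32'))
    loss = fluid.layers.reduce_mean(model(x))
    loss.backward()
    # In dygraph, _create_optimization_pass applies self._grad_clip before
    # regularization whenever grad_clip was passed to minimize.
    optimizer.minimize(loss, grad_clip=grad_clip)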
