Add a new gradient_clip method that is easier to use, test=develop (#23224)

revert-23830-2.0-beta
Zhou Wei 5 years ago committed by GitHub
parent b7b0b3595b
commit 7fda333ac1

@@ -75,7 +75,6 @@ from .transpiler import DistributeTranspiler, \
     memory_optimize, release_memory, DistributeTranspilerConfig
 from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
 from . import clip
-from . import dygraph_grad_clip
 from . import profiler
 from . import unique_name
 from . import parallel_executor
@@ -122,7 +121,6 @@ __all__ = framework.__all__ + executor.__all__ + \
     'WeightNormParamAttr',
     'DataFeeder',
     'clip',
-    'dygraph_grad_clip',
     'profiler',
     'unique_name',
     'Scope',
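Note: the standalone fluid.dygraph_grad_clip module is dropped from the package exports; the clip classes now live in fluid.clip and are shared by static graph and dygraph modes. A minimal import sketch based on the class names used in the test changes below (treat the exact paths as assumptions drawn from this diff):

    # Old, removed by this commit:
    #   from paddle.fluid.dygraph_grad_clip import GradClipByGlobalNorm
    # New, unified location:
    from paddle.fluid.clip import (
        GradientClipByValue,       # clip each gradient element into [min, max]
        GradientClipByNorm,        # clip each gradient by its own L2 norm
        GradientClipByGlobalNorm,  # clip all gradients by their global L2 norm
    )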

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -2409,7 +2409,6 @@ class Block(object):
             trainable = v.trainable
             optimize_attr = v.optimize_attr
             regularizer = v.regularizer
-            gradient_clip_attr = v.gradient_clip_attr
             error_clip = v.error_clip
         elif type(v) == Variable:
             var_type = "Variable"
@@ -2432,7 +2431,6 @@ class Block(object):
                     trainable=trainable,
                     optimize_attr=optimize_attr,
                     regularizer=regularizer,
-                    gradient_clip_attr=gradient_clip_attr,
                     error_clip=error_clip)
             else:
                 var = Parameter(
@@ -2445,7 +2443,6 @@ class Block(object):
                     trainable=trainable,
                     optimize_attr=optimize_attr,
                     regularizer=regularizer,
-                    gradient_clip_attr=gradient_clip_attr,
                     error_clip=error_clip)
         elif var_type == "Variable":
             var = Variable(
@@ -2723,7 +2720,6 @@ class Block(object):
                     trainable=p.trainable,
                     optimize_attr=p.optimize_attr,
                     regularizer=p.regularizer,
-                    gradient_clip_attr=p.gradient_clip_attr,
                     error_clip=p.error_clip,
                     name=v.name)
             else:
@@ -2737,7 +2733,6 @@ class Block(object):
                     trainable=p.trainable,
                     optimize_attr=p.optimize_attr,
                     regularizer=p.regularizer,
-                    gradient_clip_attr=p.gradient_clip_attr,
                     error_clip=p.error_clip,
                     name=v.name)
             self.vars[new_p.name] = new_p
@@ -4646,8 +4641,6 @@ class Parameter(Variable):
             Default: {'learning_rate': 1.0}
         regularizer(WeightDecayRegularizer): The Regularizer which will
             be applied on the parameter. Default: None
-        gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy
-            which will be applied on the parameter. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this parameter.
     """
@@ -4687,8 +4680,6 @@ class Parameter(Variable):
         self.regularizer = kwargs.get('regularizer', None)

-        self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
-
         self.do_model_average = kwargs.get('do_model_average', None)

         self.is_distributed = False
@@ -4723,7 +4714,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr", "do_model_average")
+                               "do_model_average")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          cpt.to_text(getattr(self, attr_name)))
@@ -4752,8 +4743,6 @@ class ParamBase(core.VarBase):
             Default: {'learning_rate': 1.0}
         regularizer(WeightDecayRegularizer): The Regularizer which will
             be applied on the ParamBase. Default: None
-        gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy
-            which will be applied on the ParamBase. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this ParamBase.
     """
@@ -4792,8 +4781,6 @@ class ParamBase(core.VarBase):
         self.regularizer = kwargs.get('regularizer', None)

-        self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
-
         self.do_model_average = kwargs.get('do_model_average', None)

         self.is_distributed = False

@@ -24,7 +24,7 @@ from . import framework
 from . import layers
 from . import unique_name
 from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
-from .clip import append_gradient_clip_ops, error_clip_callback
+from .clip import GradientClipBase, error_clip_callback, append_gradient_clip_ops
 from .framework import program_guard
 from .initializer import Constant
 from .layer_helper import LayerHelper
@@ -109,6 +109,8 @@ class Optimizer(object):
         self._opti_name_list = []
         self._accumulators_holder = {}
         self._param_device_map = dict()
+        # if pass grad_clip into minimize, it will not be None
+        self._grad_clip = None

     @framework.dygraph_only
     def state_dict(self):
@@ -690,12 +692,17 @@ class Optimizer(object):
                 # ...
                 optimizer.apply_gradients(params_grads)
         """
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
         params_grads, table_param_and_grad, table_optimize_op = \
             self._process_distribute_lookuptable(params_grads)

-        params_grads = append_gradient_clip_ops(params_grads)
+        # 'minimize(grad_clip)' or 'set_gradient_clip'
+        if self._grad_clip is not None:
+            params_grads = self._grad_clip(params_grads)
+        else:
+            params_grads = append_gradient_clip_ops(params_grads)

         # Add regularization if any
         params_grads = append_regularization_ops(params_grads,
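In apply_gradients, a clip passed through minimize(grad_clip=...) now takes precedence; only when none was passed does the optimizer fall back to whatever was registered with fluid.clip.set_gradient_clip (the append_gradient_clip_ops path). A minimal static-graph sketch of that precedence, using public fluid APIs; the toy network and values are illustrative, not part of this commit:

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1)
    avg_cost = fluid.layers.mean(
        fluid.layers.square_error_cost(input=y_predict, label=y))

    # Old path: register a clip globally; it is used only when minimize() gets no grad_clip.
    fluid.clip.set_gradient_clip(fluid.clip.GradientClipByValue(0.1))
    # New path: pass the clip to minimize(); self._grad_clip wins over set_gradient_clip.
    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
    fluid.optimizer.SGD(learning_rate=0.1).minimize(avg_cost, grad_clip=clip)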
@@ -712,19 +719,19 @@ class Optimizer(object):
         """
         Second part of `minimize`, appending optimization operators for
         given `params_grads` pairs.

         Args:
             loss (Variable): loss variable to run optimizations.
             startup_program (Program): startup_program for initializing parameters
                 in `parameter_list`.
             params_grads (list): list of (param, grad) pair to do optimization.

         Returns:
             list: A list of operators appended to the current program.
         """
         if framework.in_dygraph_mode():
             with program_guard(framework.default_main_program(),
                                framework.default_startup_program()):
+                if self._grad_clip is not None:
+                    params_grads = self._grad_clip(params_grads)
                 params_grads = append_regularization_ops(params_grads,
                                                          self.regularization)
                 optimize_ops = self._create_optimization_pass(params_grads)
@@ -809,16 +816,19 @@ class Optimizer(object):
             Please refer to the example of current Optimizer.
         """
         assert isinstance(loss, Variable), "The loss should be an Variable."
+        if grad_clip is not None:
+            if not isinstance(grad_clip, GradientClipBase):
+                raise TypeError(
+                    "'grad_clip' should be an instance of GradientClipBase's derived class"
+                )
+            self._grad_clip = grad_clip

         params_grads = self.backward(
             loss,
             startup_program=startup_program,
             parameter_list=parameter_list,
             no_grad_set=no_grad_set)

-        if grad_clip is not None and framework.in_dygraph_mode():
-            # TODO(hongyu): FIX later, this is only for dygraph, should be work for static mode
-            params_grads = grad_clip(params_grads)
-
         optimize_ops = self.apply_optimize(
             loss, startup_program=startup_program, params_grads=params_grads)
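With this change, minimize validates grad_clip up front and applies it in both static graph and dygraph mode, replacing the old dygraph-only TODO branch. A dygraph usage sketch in the style of the updated tests below (the layer and values are illustrative assumptions):

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        linear = fluid.dygraph.Linear(10, 10)
        adam = fluid.optimizer.AdamOptimizer(
            0.001, parameter_list=linear.parameters())
        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)

        data = np.random.uniform(-1, 1, [4, 10]).astype('float32')
        loss = fluid.layers.reduce_mean(linear(fluid.dygraph.to_variable(data)))
        loss.backward()
        # The clip is applied inside minimize; a non-GradientClipBase object raises TypeError.
        adam.minimize(loss, grad_clip=clip)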
@@ -1148,6 +1158,7 @@ class DGCMomentumOptimizer(Optimizer):
         self.regular_type, self.regular_coeff = self._get_regularization_param(
             self.regularization)
+        self._grad_clip = None

     def _get_regularization_param(self, regularization):
         regular_type = 0
@@ -1404,24 +1415,28 @@ class DGCMomentumOptimizer(Optimizer):
         dgc_op._set_attr(op_maker.kOpRoleVarAttrName(),
                          [param_var.name, grad_var.name])

+    @imperative_base.no_grad
     def apply_gradients(self, params_grads):
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
         params_grads, table_param_and_grad, table_optimize_op = \
             self._process_distribute_lookuptable(params_grads)

         not_dgc_params_grads = []
         dgc_params_grads = []
+        # DGC clip and regularization in optimizer.backward
         for param, grad in params_grads:
             if not self._is_use_dgc(param, grad):
                 not_dgc_params_grads.append((param, grad))
             else:
                 dgc_params_grads.append((param, grad))

-        # DGC clip and regularization in local
-        not_dgc_params_grads = append_gradient_clip_ops(not_dgc_params_grads)
+        # 'minimize(grad_clip)' or 'set_gradient_clip'
+        if self._grad_clip is not None:
+            not_dgc_params_grads = self._grad_clip(not_dgc_params_grads)
+        else:
+            not_dgc_params_grads = append_gradient_clip_ops(
+                not_dgc_params_grads)

+        # Add regularization if any
         not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
                                                          self.regularization)
@@ -3942,16 +3957,13 @@ class RecomputeOptimizer(Optimizer):
     def apply_optimize(self, loss, startup_program, params_grads):
         """
         call the apply_optimize function of self._optimizer
         Args:
             loss (Variable): loss variable to run optimizations.
             startup_program (Program): startup_program for initializing parameters
                 in `parameter_list`.
             params_grads (list): list of (param, grad) pair to do optimization.
         Examples:
             .. code-block:: python

                 import paddle.fluid as fluid

                 def mlp(input_x, input_y, hid_dim=128, label_dim=2):
@@ -3979,7 +3991,6 @@ class RecomputeOptimizer(Optimizer):
                     cost, startup_program=None, params_grads=params_grads)
                 print("Finished apply_optimize")
         """
-
         return self._optimizer.apply_optimize(
@@ -3991,24 +4002,24 @@ class RecomputeOptimizer(Optimizer):
                  parameter_list=None,
                  no_grad_set=None,
                  grad_clip=None):
-        assert (isinstance(loss, Variable)), "The loss should be an Variable."
+        assert isinstance(loss, Variable), "The loss should be an Variable."
         assert (self._checkpoints is not None
                 ), "You should call _set_checkpoints first"

         if framework.in_dygraph_mode():
             raise NotImplementedError(
                 "DyGraph current does not support recompute")
+        if grad_clip is not None:
+            if not isinstance(grad_clip, GradientClipBase):
+                raise TypeError(
+                    "'grad_clip' should be an instance of GradientClipBase's derived class"
+                )
+            self._optimizer._grad_clip = grad_clip

         params_grads = self.backward(
             loss,
             startup_program=startup_program,
             parameter_list=parameter_list,
             no_grad_set=no_grad_set)

-        if grad_clip:
-            # TODO(guru4elephant): should add grad_clip for static graph
-            pass
-
         optimize_ops = self.apply_optimize(
             loss, startup_program=startup_program, params_grads=params_grads)
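RecomputeOptimizer.minimize now forwards the validated clip to the wrapped optimizer via self._optimizer._grad_clip, so recompute training gets the same clipping behavior as the plain optimizer. A hedged sketch following the class docstring example (network, checkpoint choice, and values are assumptions, not from this commit):

    import paddle.fluid as fluid

    input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')

    fc_1 = fluid.layers.fc(input=input_x, size=128)
    prediction = fluid.layers.fc(input=[fc_1], size=2, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
    sum_cost = fluid.layers.reduce_mean(cost)

    sgd = fluid.optimizer.SGD(learning_rate=0.01)
    sgd = fluid.optimizer.RecomputeOptimizer(sgd)
    sgd._set_checkpoints([fc_1, prediction])
    # The validated clip ends up on the inner optimizer (self._optimizer._grad_clip).
    sgd.minimize(sum_cost, grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))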

@@ -15,6 +15,7 @@
 from __future__ import print_function

 import six
+import warnings

 from .initializer import Initializer, Xavier, Constant
 from .regularizer import WeightDecayRegularizer
@@ -68,7 +69,6 @@ class ParamAttr(object):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 gradient_clip=None,
                  do_model_average=True):
         self.name = name
         if isinstance(self.name, six.string_types) and self.name == "":
@@ -78,7 +78,6 @@ class ParamAttr(object):
         self.learning_rate = learning_rate
         self.regularizer = regularizer
         self.trainable = trainable
-        self.gradient_clip = gradient_clip
         self.do_model_average = do_model_average

     def _set_default_initializer(self, initializer):
@@ -176,7 +175,6 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip,
             'do_model_average': self.do_model_average
         }
         if with_initializer:
@@ -248,7 +246,6 @@ class WeightNormParamAttr(ParamAttr):
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 gradient_clip=None,
                  do_model_average=False):
         super(WeightNormParamAttr, self).__init__(
             name=name,
@@ -256,6 +253,5 @@ class WeightNormParamAttr(ParamAttr):
             learning_rate=learning_rate,
             regularizer=regularizer,
             trainable=trainable,
-            gradient_clip=gradient_clip,
             do_model_average=do_model_average)
         self.dim = dim
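Since gradient_clip is removed from ParamAttr and WeightNormParamAttr, per-parameter clipping moves into the clip object itself; the transpiler test below switches to the need_clip callback used by this commit. A condensed sketch (the filter name is illustrative):

    import paddle.fluid as fluid

    # Old, removed: fluid.ParamAttr(name='fc_w',
    #                               gradient_clip=fluid.clip.GradientClipByValue(0.1))
    # New: select the parameters inside the clip object and pass it to minimize().
    def only_fc_w(param):
        return param.name == "fc_w"

    clip = fluid.clip.GradientClipByValue(0.1, need_clip=only_fc_w)
    # ... build the network and an optimizer, then:
    # optimizer.minimize(avg_cost, grad_clip=clip)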

@@ -476,15 +476,18 @@ class TestL2Decay(TranspilerTest):
             size=1000,
             act=None,
             param_attr=fluid.ParamAttr(
-                name='fc_w',
-                regularizer=fluid.regularizer.L2Decay(),
-                gradient_clip=fluid.clip.GradientClipByValue(0.1)),
+                name='fc_w', regularizer=fluid.regularizer.L2Decay()),
             bias_attr=fluid.ParamAttr(name='fc_b'))
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
         avg_cost = fluid.layers.mean(cost)
         sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-        sgd_optimizer.minimize(avg_cost)
+
+        def filter(param):
+            return param.name == "fc_w"
+
+        clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter)
+        sgd_optimizer.minimize(avg_cost, grad_clip=clip)

     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)

@@ -25,7 +25,7 @@ from paddle.fluid import core

 from paddle.fluid.dygraph.base import to_variable
-from paddle.fluid.dygraph_grad_clip import GradClipByValue, GradClipByNorm, GradClipByGlobalNorm
+from paddle.fluid.clip import GradientClipByValue, GradientClipByNorm, GradientClipByGlobalNorm


 class TestGradClipByGlobalNorm(unittest.TestCase):
@@ -65,7 +65,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase):
     def get_dygrap_global_norm_result(self):
         with fluid.dygraph.guard():

-            gloabl_norm_clip = GradClipByGlobalNorm(self.max_global_norm)
+            gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm)
             p_g_var = []
             for p, g in self.para_and_grad:
                 new_p = to_variable(p)
@@ -135,7 +135,7 @@ class TestGradClipByNorm(unittest.TestCase):
     def get_dygrap_norm_result(self):
         with fluid.dygraph.guard():

-            norm_clip = GradClipByNorm(self.max_norm)
+            norm_clip = GradientClipByNorm(self.max_norm)
             p_g_var = []
             for p, g in self.para_and_grad:
                 new_p = to_variable(p)
@@ -200,8 +200,8 @@ class TestGradClipByValue(unittest.TestCase):
     def get_dygrap_clip_result(self):
         with fluid.dygraph.guard():

-            value_clip = GradClipByValue(self.min_value, self.max_value)
+            value_clip = GradientClipByValue(
+                max=self.max_value, min=self.min_value)
             p_g_var = []
             for p, g in self.para_and_grad:
                 new_p = to_variable(p)
@@ -225,7 +225,7 @@ class TestGradClipByValue(unittest.TestCase):
         for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
             self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))

-    def test_clip_by_norm_2(self):
+    def test_clip_by_value_2(self):
         self.init_value()
         self.init_scale = 0.2
@@ -236,7 +236,7 @@ class TestGradClipByValue(unittest.TestCase):
         for (p_np, g_np), (p_dy, g_dy) in zip(np_p_g, dy_out_p_g):
             self.assertTrue(np.allclose(g_np, g_dy, rtol=1e-6, atol=1e-8))

-    def test_clip_by_norm_3(self):
+    def test_clip_by_value_3(self):
         self.init_value()
         self.init_scale = 0.5
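As these tests show, the renamed classes keep the calling style of the old GradClip* helpers: in dygraph a clip object can be applied directly to a list of (param, grad) variable pairs and returns the clipped list. A small standalone sketch in the spirit of the tests (shapes and values are illustrative):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.dygraph.base import to_variable
    from paddle.fluid.clip import GradientClipByValue

    with fluid.dygraph.guard():
        value_clip = GradientClipByValue(max=0.1, min=-0.1)
        p = to_variable(np.random.uniform(-1, 1, [8]).astype('float32'))
        g = to_variable(np.random.uniform(-1, 1, [8]).astype('float32'))
        clipped = value_clip([(p, g)])
        # Every element of the clipped gradient lies in [-0.1, 0.1].
        print(clipped[0][1].numpy())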

File diff suppressed because it is too large.

@@ -331,7 +331,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
             model = MyLayer(size, vocab_size, size)
             optimizer = fluid.optimizer.AdamOptimizer(
                 0.001, parameter_list=model.parameters())
-            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
+            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)

             indices = fluid.dygraph.to_variable(indices)
             embed = fluid.dygraph.to_variable(embed)
@@ -350,7 +350,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
             model = MyLayer2(size, vocab_size, size)
             optimizer = fluid.optimizer.AdamOptimizer(
                 0.001, parameter_list=model.parameters())
-            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
+            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)

             indices = fluid.dygraph.to_variable(indices)
             emebd = fluid.dygraph.to_variable(embed)

@@ -49,7 +49,7 @@ class TestSimpleNet(unittest.TestCase):
         with fluid.dygraph.guard(place):
             backward_strategy = fluid.dygraph.BackwardStrategy()
             backward_strategy.sort_sum_gradient = sort_sum_gradient
-            # grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
+            # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)

             input_word = np.array([[1, 2], [2, 1]]).astype('int64')
             input = to_variable(input_word)
@@ -83,8 +83,7 @@ class TestSimpleNet(unittest.TestCase):
         with fluid.dygraph.guard(place):
             backward_strategy = fluid.dygraph.BackwardStrategy()
             backward_strategy.sort_sum_gradient = sort_sum_gradient
-            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
-                5.0)
+            grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)

             input_word = np.array([[1, 2], [2, 1]]).astype('int64')
             input = to_variable(input_word)
