@@ -14,11 +14,13 @@
from paddle.fluid.optimizer import Optimizer
from paddle.fluid.regularizer import L1DecayRegularizer
from paddle.fluid.regularizer import L2DecayRegularizer
from paddle.fluid.regularizer import append_regularization_ops
from paddle.fluid import framework
from paddle.fluid import core
from paddle.fluid.framework import program_guard
from paddle.fluid.clip import append_gradient_clip_ops
from paddle.fluid import unique_name
from paddle.fluid import layers
from paddle.fluid.layer_helper import LayerHelper
import warnings

__all__ = ['Momentum']
@@ -61,6 +63,9 @@ class Momentum(Optimizer):
            some derived class of ``GradientClipBase``. There are three clipping strategies
            (:ref:`api_fluid_clip_GradientClipByGlobalNorm`, :ref:`api_fluid_clip_GradientClipByNorm`,
            :ref:`api_fluid_clip_GradientClipByValue`). Default None, meaning there is no gradient clipping.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
            Often choose to be ``1.0/batch_size``.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
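A minimal usage sketch of the two new arguments, assuming the class is exported as paddle.fluid.contrib.optimizer.Momentum and that `batch_size` is just a placeholder value:

from paddle.fluid.contrib.optimizer import Momentum

batch_size = 256  # placeholder
optimizer = Momentum(
    learning_rate=0.01,
    momentum=0.9,
    multi_precision=True,           # keep an FP32 master copy of FP16 weights
    rescale_grad=1.0 / batch_size)  # gradient is multiplied by this inside the op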
@@ -105,6 +110,8 @@ class Momentum(Optimizer):
                 use_nesterov=False,
                 regularization=None,
                 grad_clip=None,
                 multi_precision=False,
                 rescale_grad=1.0,
                 name=None):
        assert learning_rate is not None
        assert momentum is not None
@@ -124,11 +131,68 @@ class Momentum(Optimizer):
        if (isinstance(regularization, L2DecayRegularizer)):
            self._regularization_method = "l2_decay"
            self._regularization_coeff = regularization._regularization_coeff
        self._multi_precision = multi_precision
        self._rescale_grad = rescale_grad
        self._master_weights = {}

    def _create_master_weight(self, param):
        assert isinstance(self.helper, LayerHelper)

        var_name = param.name + "_fp32_master"
        var_name = unique_name.generate(var_name)
        var = layers.create_global_var(
            name=var_name,
            shape=param.shape,
            value=0,
            dtype='float32',
            persistable=True)
        block = self.helper.startup_program.global_block()
        block.append_op(
            type="cast",
            inputs={"X": [param]},
            outputs={"Out": [var]},
            attrs={
                "in_dtype": param.dtype,
                "out_dtype": core.VarDesc.VarType.FP32
            })
        self._master_weights[param.name] = var
        return var
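
    # Note on _create_master_weight above: each FP16 parameter gets a
    # persistable FP32 shadow variable named "<param>_fp32_master" (made
    # unique with unique_name.generate), seeded in the startup program by
    # casting the parameter's initial value, and cached in
    # self._master_weights keyed by the original parameter name.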

    def _get_accumulator(self, name, param):
        """Utility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched

        Returns:
            accumulator variable for the parameter
        """
        if self._name is not None:
            name = self._name + "_" + name
        find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
        target_param = self._master_weights[
            param.name] if find_master else param
        target_name = target_param.name
        if (name not in self._accumulators or
                target_name not in self._accumulators[name]):
            raise Exception("Accumulator {} does not exist for parameter {}".
                            format(name, target_name))
        return self._accumulators[name][target_name]
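
    # The lookup above pairs with _create_accumulators below: when
    # multi_precision is enabled, the velocity accumulator is registered
    # against the FP32 master weight, so accumulator lookups for an FP16
    # parameter are redirected to the master weight's name rather than the
    # raw parameter name.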

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        for p in parameters:
            if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
                master_p = self._create_master_weight(p)
                self._add_accumulator(self._velocity_acc_str, master_p)
                continue
            if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
                warnings.warn(
                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
                    "Consider using multi_precision=True option of the Momentum optimizer."
                )
            self._add_accumulator(self._velocity_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
@@ -136,6 +200,10 @@ class Momentum(Optimizer):
        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
        find_master = self._multi_precision and param_and_grad[
            0].dtype == core.VarDesc.VarType.FP16
        master_weight = (self._master_weights[param_and_grad[0].name]
                         if find_master else None)
        lr = self._create_param_lr(param_and_grad)

        if framework.in_dygraph_mode():
@@ -151,7 +219,9 @@ class Momentum(Optimizer):
            "mu": self._momentum,
            "use_nesterov": self._use_nesterov,
            "regularization_method": self._regularization_method,
            "regularization_coeff": self._regularization_coeff,
            "multi_precision": find_master,
            "rescale_grad": self._rescale_grad
        }
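        # For reference, a rough sketch of what the fused momentum kernel
        # computes with these attributes (not the exact kernel code):
        #   grad = grad * rescale_grad
        #   grad = grad + regularization_coeff * param     # if l2_decay is fused
        #   velocity_out = mu * velocity + grad
        #   param_out = param - lr * (grad + mu * velocity_out)  # Nesterov
        #   param_out = param - lr * velocity_out                # plain momentum
        # With multi_precision, the arithmetic runs on the FP32 master copy.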
        inputs = {
            "Param": [param_and_grad[0]],
@@ -159,11 +229,15 @@ class Momentum(Optimizer):
            "Velocity": [velocity_acc],
            "LearningRate": [lr]
        }

        outputs = {
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc]
        }

        if find_master:
            inputs["MasterParam"] = master_weight
            outputs["MasterParamOut"] = master_weight
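
        # When a master weight exists, the momentum op also reads and writes
        # it: the update is applied to the FP32 MasterParam and the FP16
        # ParamOut is refreshed from it, so rounding error does not accumulate
        # in the low-precision weights across steps.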

        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,