@@ -17,8 +17,10 @@ from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope
from ..fluid.layer_helper import LayerHelper
from ..fluid import unique_name
from ..fluid import layers
import paddle.fluid as fluid
from paddle.fluid.regularizer import L2DecayRegularizer
__all__ = ["Momentum"]
@@ -62,6 +64,9 @@ class Momentum(Optimizer):
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
        rescale_grad (float, optional): Multiply the gradient with ``rescale_grad`` before updating. \
            Often choose to be ``1.0/batch_size``.
        name (str, optional): The default value is None. Normally there is no need for the user
            to set this property. For more information, please refer to
            :ref:`api_guide_Name` .
@@ -92,20 +97,33 @@ class Momentum(Optimizer):
                 use_nesterov=False,
                 weight_decay=None,
                 grad_clip=None,
                 multi_precision=False,
                 rescale_grad=1.0,
                 name=None):
        if learning_rate is None:
            raise ValueError("learning_rate is not set")
        if momentum is None:
            raise ValueError("momentum is not set")
        predicate = lambda regular: isinstance(regular, L2DecayRegularizer)
        py_regular = None if predicate(weight_decay) else weight_decay
        super(Momentum, self).__init__(
            learning_rate=learning_rate,
            parameters=parameters,
-            weight_decay=weight_decay,
+            weight_decay=py_regular,
            grad_clip=grad_clip,
            name=name)
        self.type = "momentum"
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)
        self._regularization_method = ""
        self._regularization_coeff = 0
        if isinstance(weight_decay, L2DecayRegularizer):
            self._regularization_method = "l2_decay"
            self._regularization_coeff = weight_decay._regularization_coeff
        self._multi_precision = multi_precision
        self._rescale_grad = rescale_grad
        self._master_weights = {}
        if framework.in_dygraph_mode():
            self.helper = LayerHelper(self.__class__.__name__)
            for p in parameters:
@@ -115,8 +133,62 @@ class Momentum(Optimizer):
            ).all_parameters()
            self.helper = LayerHelper(self.__class__.__name__)
            for p in all_parameters:
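                # With multi_precision enabled, an FP16 parameter gets an FP32
                # master copy and its velocity accumulator is attached to that
                # master weight instead of the FP16 parameter itself.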
                if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
                    master_p = self._create_master_weight(p)
                    self._add_accumulator(self._velocity_acc_str, master_p)
                    continue
                if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
                    warnings.warn(
                        "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
                        "Consider using multi_precision=True option of the Momentum optimizer."
                    )
                self._add_accumulator(self._velocity_acc_str, p)

    def _create_master_weight(self, param):
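        """Create a persistable FP32 copy of ``param`` and register a cast op
        in the startup program that initializes it from the FP16 parameter."""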
        assert isinstance(self.helper, LayerHelper)

        var_name = param.name + "_fp32_master"
        var_name = unique_name.generate(var_name)
        var = layers.create_global_var(
            name=var_name,
            shape=param.shape,
            value=0,
            dtype='float32',
            persistable=True)
        block = self.helper.startup_program.global_block()
        block.append_op(
            type="cast",
            inputs={"X": [param]},
            outputs={"Out": [var]},
            attrs={
                "in_dtype": param.dtype,
                "out_dtype": core.VarDesc.VarType.FP32
            })
        self._master_weights[param.name] = var
        return var

    def _get_accumulator(self, name, param):
        """Utility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched

        Returns:
            accumulator variable for the parameter
        """
        if self._name is not None:
            name = self._name + "_" + name
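        # For FP16 parameters under multi_precision, accumulators are keyed by
        # the name of the corresponding FP32 master weight.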
        find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
        target_param = self._master_weights[
            param.name] if find_master else param
        target_name = target_param.name
        if (name not in self._accumulators or
                target_name not in self._accumulators[name]):
            raise Exception("Accumulator {} does not exist for parameter {}".
                            format(name, target_name))
        return self._accumulators[name][target_name]

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)
        # create accumulator in init func, so no implementation here
@@ -126,16 +198,30 @@ class Momentum(Optimizer):
        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
        find_master = self._multi_precision and param_and_grad[
            0].dtype == core.VarDesc.VarType.FP16
        master_weight = (self._master_weights[param_and_grad[0].name]
                         if find_master else None)
        lr = self._create_param_lr(param_and_grad)
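        # In dygraph mode the fused momentum op is called directly; the L2
        # regularization attributes are passed straight through to the op.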
        if framework.in_dygraph_mode():
-            _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1],
-                                     velocity_acc, lr, param_and_grad[0],
-                                     velocity_acc, 'mu', self._momentum,
-                                     'use_nesterov', self._use_nesterov)
+            _, _ = core.ops.momentum(
+                param_and_grad[0], param_and_grad[1], velocity_acc, lr,
+                param_and_grad[0], velocity_acc, 'mu', self._momentum,
+                'use_nesterov', self._use_nesterov, 'regularization_method',
+                self._regularization_method, 'regularization_coeff',
+                self._regularization_coeff)
            return None

-        attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
+        attrs = {
+            "mu": self._momentum,
+            "use_nesterov": self._use_nesterov,
+            "regularization_method": self._regularization_method,
+            "regularization_coeff": self._regularization_coeff,
+            "multi_precision": find_master,
+            "rescale_grad": self._rescale_grad
+        }
        inputs = {
            "Param": [param_and_grad[0]],
            "Grad": [param_and_grad[1]],
@@ -147,6 +233,11 @@ class Momentum(Optimizer):
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc]
        }
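        # When multi-precision is active, the FP32 master weight is wired
        # through the op as an extra input/output so it is updated alongside
        # the FP16 parameter.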
        if find_master:
            inputs["MasterParam"] = master_weight
            outputs["MasterParamOut"] = master_weight

        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,