@@ -1,5 +1,6 @@
 import functools
 import layers
+from framework import Variable
 from . import core
 
 __all__ = [
@@ -44,7 +45,7 @@ def error_clip_callback(block, context):
 
 
 class BaseGradientClipAttr(object):
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         raise NotImplementedError()
 
     def create_operators(self, param, grad):
@@ -52,7 +53,7 @@ class BaseGradientClipAttr(object):
 
 
 class NullGradientClipAttr(BaseGradientClipAttr):
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         pass
 
     def create_operators(self, param, grad):
@@ -69,7 +70,7 @@ class GradientClipByValue(BaseGradientClipAttr):
         self.max = max
         self.min = min
 
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         pass
 
     def create_operators(self, param, grad):
@@ -81,7 +82,7 @@ class GradientClipByNorm(BaseGradientClipAttr):
     def __init__(self, clip_norm):
         self.clip_norm = clip_norm
 
-    def process_context(self, context, p_g):
+    def process_context(self, context, param, grad):
         pass
 
     def create_operators(self, param, grad):
@@ -89,6 +90,46 @@ class GradientClipByNorm(BaseGradientClipAttr):
         return param, new_grad
+
+
+class GradientClipByGlobalNorm(BaseGradientClipAttr):
+    global_norm_var = None
+    clip_norm_var = None
+    ratio_var = None
+
+    @classmethod
+    def init(cls, clip_norm):
+        cls.global_norm_var = layers.fill_constant(
+            shape=[1], dtype="float32", value=0.0)
+        cls.clip_norm_var = layers.fill_constant(
+            shape=[1], dtype="float32", value=clip_norm)
+
+    def __init__(self):
+        if not (isinstance(self.__class__.global_norm_var, Variable) and
+                isinstance(self.__class__.clip_norm_var, Variable)):
+            raise ValueError(
+                "Class 'GradientClipByGlobalNorm' has not been properly initialized. Please call GradientClipByGlobalNorm.init() first."
+            )
+
+    def process_context(self, context, param, grad):
+        local_norm_var = layers.reduce_sum(
+            x=layers.pow(x=grad, factor=2), reduce_all=True)
+        layers.sums(
+            input=[local_norm_var, self.__class__.global_norm_var],
+            out=[self.__class__.global_norm_var])
+
+    def create_operators(self, param, grad):
+        if self.__class__.ratio_var is None:
+            self.__class__.global_norm_var = layers.sqrt(
+                x=self.__class__.global_norm_var)
+            self.__class__.ratio_var = layers.elementwise_div(
+                x=self.__class__.clip_norm_var,
+                y=layers.elementwise_max(
+                    x=self.__class__.clip_norm_var,
+                    y=self.__class__.global_norm_var))
+        # elementwise_max is not available yet.
+        # There is no way to feed ratio_var to scale_op yet.
+        # new_grad = layers.
 
 
 def append_gradient_clip_ops(param_grad):
     context = dict()
     create_op_callbacks = []
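Note on the new class above: GradientClipByGlobalNorm keeps its shared state (global_norm_var, clip_norm_var, ratio_var) in class attributes, so the one-time init() call has to happen before any instance is constructed, otherwise __init__ raises the ValueError shown. A minimal sketch of that calling order (the clip_norm value and the params_grads list are assumptions, not part of this diff):

    # Hypothetical sketch: initialize the shared clip-norm state once,
    # then let each (param, grad) pair contribute to the global norm.
    GradientClipByGlobalNorm.init(clip_norm=5.0)   # assumed threshold
    context = dict()
    for p, g in params_grads:                      # assumed list of (param, grad) Variables
        attr = GradientClipByGlobalNorm()          # would raise ValueError if init() were skipped
        attr.process_context(context=context, param=p, grad=g)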
@@ -98,10 +139,9 @@ def append_gradient_clip_ops(param_grad):
             clip_attr = NullGradientClipAttr()
         if not isinstance(clip_attr, BaseGradientClipAttr):
             raise TypeError(
-                "clip attribute should be an instance of BaseGradientClippingAttr"
-            )
+                "clip attribute should be an instance of BaseGradientClipAttr")
 
-        clip_attr.process_context(context=context, p_g=param_grad)
+        clip_attr.process_context(context=context, param=p, grad=g)
         create_op_callbacks.append(
             functools.partial(
                 clip_attr.create_operators, param=p, grad=g))
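For context, the hunk above implies a two-phase protocol in append_gradient_clip_ops: every clip attribute first sees each (param, grad) pair through process_context (accumulating shared state such as the squared global norm), and only afterwards are the deferred create_operators callbacks, captured with functools.partial, meant to be invoked. A rough standalone sketch of that flow (the driver function name and the invocation loop are assumptions; the diff only shows the callbacks being collected):

    import functools

    def run_clipping(params_grads, get_clip_attr):
        # Hypothetical driver, not part of this diff.
        context = dict()
        callbacks = []
        # Phase 1: accumulate shared statistics across all parameters.
        for p, g in params_grads:
            attr = get_clip_attr(p)
            attr.process_context(context=context, param=p, grad=g)
            callbacks.append(
                functools.partial(attr.create_operators, param=p, grad=g))
        # Phase 2: emit the clipping operators once the shared state is complete.
        return [cb() for cb in callbacks]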