# Paddle/python/paddle/fluid/layer_helper_base.py
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import copy
import numpy as np

from .framework import Variable, default_main_program, default_startup_program, in_dygraph_mode, _current_expected_place
from . import unique_name
from .param_attr import ParamAttr, WeightNormParamAttr
from . import core


class LayerHelperBase(object):
    def __init__(self, name, layer_type):
        self._layer_type = layer_type
        self._name = name

    @property
    def name(self):
        return self._name

    @property
    def layer_type(self):
        return self._layer_type

    @property
    def main_program(self):
        return default_main_program()

    @property
    def startup_program(self):
        return default_startup_program()

    def to_variable(self, value, block=None):
        """Convert a value into a Variable.

        Args:
            value: the value to be converted
            block: the block that the new Variable belongs to

        Returns:
            a Variable constructed from ``value``.
        """
        if isinstance(value, np.ndarray):
            assert in_dygraph_mode(
            ), "to_variable could only be called in dygraph mode"
            if not block:
                block = default_main_program().current_block()
            py_var = Variable(
                block,
                type=core.VarDesc.VarType.LOD_TENSOR,
                name=None,
                shape=value.shape,
                dtype=value.dtype)
            var = py_var._ivar.value()
            tensor = var.get_tensor()
            tensor.set(value, _current_expected_place())
            return py_var
        elif isinstance(value, Variable):
            return value
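
    # A hedged usage sketch (not part of the original file): in dygraph mode a
    # numpy array can be wrapped so it joins imperative computation, roughly
    #     with fluid.dygraph.guard():
    #         x = helper.to_variable(np.ones([2, 2], dtype='float32'))
    # where `helper` is assumed to be a LayerHelperBase instance; a Variable
    # passed in is returned unchanged, and other types fall through to None.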

    def _create_weight_normalize(self, attr, shape, dtype):
        from .layers import elementwise_mul, elementwise_div, reshape

        # Remove these ops when LayerHelper and layers support indicating
        # program and block.
        def __norm_op(x,
                      out=None,
                      p=2,
                      dim=None,
                      keep_dim=False,
                      block=self.startup_program.global_block()):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(".".join(
                        [self.name, 'weight_norm_norm'])),
                    dtype=dtype,
                    persistable=False)
            abs_out = block.create_var(
                name=unique_name.generate_with_ignorable_key(".".join(
                    [self.name, 'weight_norm_abs'])),
                dtype=dtype,
                persistable=False)
            block.append_op(
                type='abs', inputs={'X': x}, outputs={'Out': abs_out})
            pow_out = block.create_var(
                name=unique_name.generate_with_ignorable_key(".".join(
                    [self.name, 'weight_norm_pow'])),
                dtype=dtype,
                persistable=False)
            block.append_op(
                type='pow',
                inputs={'X': abs_out},
                outputs={'Out': pow_out},
                attrs={'factor': float(p)})
            sum_out = block.create_var(
                name=unique_name.generate_with_ignorable_key(".".join(
                    [self.name, 'weight_norm_sum'])),
                dtype=dtype,
                persistable=False)
            block.append_op(
                type='reduce_sum',
                inputs={'X': pow_out},
                outputs={'Out': sum_out},
                attrs={
                    'dim': dim,
                    'keep_dim': keep_dim,
                    'reduce_all': True if dim is None else False
                })
            block.append_op(
                type='pow',
                inputs={'X': sum_out},
                outputs={'Out': out},
                attrs={'factor': 1. / p})
            return out

        def __reshape_op(x,
                         shape,
                         out=None,
                         block=self.startup_program.global_block()):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(".".join(
                        [self.name, 'weight_norm_reshape'])),
                    dtype=dtype,
                    persistable=False)
            block.append_op(
                type='reshape',
                inputs={'X': x},
                outputs={'Out': out},
                attrs={'shape': shape})
            return out

        def __transpose_op(x,
                           axis,
                           out=None,
                           block=self.startup_program.global_block()):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(".".join(
                        [self.name, 'weight_norm_transpose'])),
                    dtype=dtype,
                    persistable=False)
            block.append_op(
                type='transpose',
                inputs={'X': x},
                outputs={'Out': out},
                attrs={'axis': axis})
            return out

        def __norm_except_dim(x,
                              out=None,
                              dim=None,
                              block=self.startup_program.global_block()):
            """Computes the norm over all dimensions except dim"""
            if out is None:
                out = block.create_var(
                    name=unique_name.generate_with_ignorable_key(".".join(
                        [self.name, 'weight_norm_norm'])),
                    dtype=dtype,
                    persistable=False)
            if dim is None:
                __norm_op(x, out, dim=dim, block=block)
            elif dim == 0:
                out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
                reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
                norm = __norm_op(reshape, dim=[1], block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            elif dim == len(x.shape) - 1:
                out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
                reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
                norm = __norm_op(reshape, dim=[0], block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            else:
                perm = list(range(len(x.shape)))
                perm[0], perm[dim] = dim, 0
                transpose = __transpose_op(x, perm, block=block)
                out_shape = [transpose.shape[0]] + [1] * (
                    len(transpose.shape) - 1)
                reshape = __reshape_op(
                    transpose, shape=[transpose.shape[0], -1], block=block)
                norm = __norm_op(reshape, dim=[1], block=block)
                reshape2 = __reshape_op(norm, shape=out_shape, block=block)
                __transpose_op(reshape2, perm, out=out, block=block)
            return out
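
        # A hedged note (not in the original source): __norm_except_dim behaves
        # roughly like the NumPy sketch
        #     m = np.swapaxes(v, 0, dim).reshape(v.shape[dim], -1)
        #     n = np.linalg.norm(m, axis=1)   # L2 norm over all other axes
        # followed by reshaping `n` so it broadcasts against v; with dim=None a
        # single scalar norm over every element is computed instead.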

        def __weight_normalize(g, v, dim):
            """Calculations for weight normalization"""
            norm = __norm_except_dim(
                v, dim=dim, block=self.main_program.current_block())
            scale = elementwise_div(
                x=g, y=norm)  # The shapes of g and norm are the same.
            # Currently, elementwise_mul only supports broadcasting when the
            # shape of y is a subset of the shape of x. Thus, we reshape y to
            # squeeze it into such a subset shape.
            w = elementwise_mul(
                x=v,
                y=scale if dim is None else reshape(
                    x=scale, shape=[v.shape[dim]]),
                axis=-1 if dim is None else dim)
            # To serialize the original parameter for inference, maybe a
            # parameter rather than a variable should be returned.
            return w
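
        # A hedged note (not in the original source): the helper above applies
        # the reparameterization from Salimans & Kingma (2016),
        #     w = g * v / ||v||,
        # where the norm excludes axis `dim`, so g carries the magnitude of w
        # and v carries its direction.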

        g_param_attr = copy.deepcopy(attr)
        g_param_attr.name = attr.name + '_g'
        g_param_shape = [1] * len(shape)
        if attr.dim is not None:
            g_param_shape[attr.dim] = shape[attr.dim]
        v_param_attr = copy.deepcopy(attr)
        v_param_attr.name = attr.name + '_v'
        v_param_shape = shape

        # Add to startup_program to initialize g and v.
        # Try to reconstruct the initializer of w by initializing g and v.
        # Set the initializers of g and v as below, then the distribution
        # of w is the same as initializing w with the given initializer.
        # For data-dependent initialization, compute the initial values of
        # g and v externally and then feed them to g and v by executing an
        # extra program.
        g_param = self.startup_program.global_block().create_parameter(
            dtype=dtype,
            shape=g_param_shape,
            **g_param_attr._to_kwargs(with_initializer=False))
        v_param = self.startup_program.global_block().create_parameter(
            dtype=dtype,
            shape=v_param_shape,
            **v_param_attr._to_kwargs(with_initializer=True))
        __norm_except_dim(
            x=v_param,
            out=g_param,
            dim=attr.dim,
            block=self.startup_program.global_block())

        # Keep g_param's shape consistent with that in main_program.
        __reshape_op(
            g_param,
            g_param_shape,
            out=g_param,
            block=self.startup_program.global_block())

        # Add weight normalization to main_program.
        g_param = self.main_program.global_block().create_parameter(
            dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs())
        v_param = self.main_program.global_block().create_parameter(
            dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs())
        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
        return w_param
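
    # A hedged usage sketch (not part of the original file): weight
    # normalization is requested by passing a WeightNormParamAttr as a layer's
    # parameter attribute, roughly
    #     fc = fluid.layers.fc(input=x,
    #                          size=10,
    #                          param_attr=fluid.WeightNormParamAttr(dim=None))
    # which makes create_parameter() below route through
    # _create_weight_normalize and build w = g * v / ||v|| instead of a single
    # raw weight (see https://arxiv.org/abs/1602.07868).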

    # TODO: hide the func after we move the layers to Layers
    def create_parameter(self,
                         attr,
                         shape,
                         dtype,
                         is_bias=False,
                         default_initializer=None,
                         stop_gradient=False):
        """Create a parameter for this layer.

        Args:
            attr: [ParamAttr] the parameter attribute for this parameter
            shape: shape of the parameter
            dtype: data type of this parameter
            is_bias: whether this is a bias parameter
            default_initializer: the default initializer for this parameter

        Returns:
            the created parameter Variable.
        """
        # Deepcopy the attr so that parameters can be shared in program
        attr = copy.deepcopy(attr)
        attr = ParamAttr._to_attr(attr)
        if not attr:
            return None
        assert isinstance(attr, ParamAttr)
        suffix = 'b' if is_bias else 'w'
        if attr.name is None:
            attr.name = unique_name.generate(".".join([self.name, suffix]))
        if default_initializer is None and attr.initializer is None:
            if isinstance(dtype, core.VarDesc.VarType):
                if dtype != core.VarDesc.VarType.FP32 and \
                        dtype != core.VarDesc.VarType.FP64 and \
                        dtype != core.VarDesc.VarType.FP16:
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            else:
                if not (dtype.startswith("float") or dtype == "double"):
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            if is_bias:
                attr._set_default_bias_initializer()
            else:
                attr._set_default_param_initializer()
        else:
            attr._set_default_initializer(default_initializer)

        # If weight normalization is set, insert extra parameters and ops.
        # Refer to https://arxiv.org/pdf/1602.07868.pdf
        if isinstance(attr, WeightNormParamAttr):
            param = self._create_weight_normalize(attr, shape, dtype)
            WeightNormParamAttr.params_with_weight_norm.append(param)
            return param
        if in_dygraph_mode():
            # In dygraph mode, we want the returned parameter to be
            # initialized so that it can be used imperatively.
            return self.main_program.global_block().create_parameter(
                dtype=dtype,
                shape=shape,
                stop_gradient=stop_gradient,
                **attr._to_kwargs(with_initializer=True))
        else:
            self.startup_program.global_block().create_parameter(
                dtype=dtype,
                shape=shape,
                **attr._to_kwargs(with_initializer=True))
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs())
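
    # A hedged usage sketch (not part of the original file): a layer helper
    # typically builds its weights through this method, roughly
    #     w = helper.create_parameter(attr=param_attr,
    #                                 shape=[input_dim, output_dim],
    #                                 dtype='float32',
    #                                 is_bias=False)
    # In static graph mode the parameter is registered in startup_program with
    # its initializer and again in main_program; in dygraph mode a single
    # already-initialized parameter is returned for imperative use.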

    def create_variable_for_type_inference(self, dtype, stop_gradient=False):
        """Create a temporary variable whose type will be inferred later.

        Note:
            The default type will be set to LOD_TENSOR. However, when
            the var is used as an operator output, its type will be updated
            based on the operator's `VarTypeInference` implementation in
            infer_var_type.
        """
        return self.main_program.current_block().create_var(
            name=unique_name.generate_with_ignorable_key(".".join(
                [self.name, 'tmp'])),
            dtype=dtype,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=stop_gradient)
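
    # A hedged usage sketch (not part of the original file): layer code usually
    # creates an op's output this way before appending the op that fills in its
    # type, roughly
    #     out = helper.create_variable_for_type_inference(dtype=x.dtype)
    #     helper.append_op(type='relu', inputs={'X': x}, outputs={'Out': out})
    # where `append_op` is assumed to come from the LayerHelper subclass.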

    def create_variable(self, *args, **kwargs):
        """Create a Variable for this layer.

        Returns:
            the created Variable.
        """
        return self.main_program.current_block().create_var(*args, **kwargs)

    def create_global_variable(self, persistable=False, *args, **kwargs):
        """
        Create a global variable. Note that no initializer is attached to this
        global variable.

        Args:
            persistable(bool): True if it is a checkpoint value.
            *args: See create_var's documentation
            **kwargs: See create_var's documentation

        Returns(Variable): the created variable.
        """
        return self.main_program.global_block().create_var(
            *args, persistable=persistable, **kwargs)

    def create_or_get_global_variable(self, name, *args, **kwargs):
        """
        Creates a global variable if it does not exist, and returns the
        variable together with a boolean flag which is True when it is a newly
        created variable.
        """
        if self.main_program.global_block().has_var(name):
            return self.main_program.global_block().var(name), False
        else:
            return self.create_global_variable(
                name=name, *args, **kwargs), True

    def set_variable_initializer(self, var, initializer):
        """Set the target Variable's initializer.

        Args:
            var: the target Variable
            initializer: the initializer to use
        """
        assert isinstance(var, Variable)
        if in_dygraph_mode():
            initializer(var, var.block)
        else:
            self.startup_program.global_block().create_var(
                name=var.name,
                type=var.type,
                dtype=var.dtype,
                shape=var.shape,
                persistable=True,
                initializer=initializer)
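
    # A hedged usage sketch (not part of the original file): a layer can pin a
    # variable to a concrete initial value, roughly
    #     helper.set_variable_initializer(
    #         var, initializer=fluid.initializer.Constant(value=0.0))
    # In static graph mode this records a persistable copy of the variable in
    # startup_program that carries the initializer; in dygraph mode the
    # initializer runs on the variable immediately.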