@@ -19,10 +19,10 @@ from framework import convert_np_dtype_to_dtype_
 from core import VarDesc
 
 __all__ = [
-    'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'force_init_on_cpu',
-    'init_on_cpu', 'ConstantInitializer', 'UniformInitializer',
-    'NormalInitializer', 'XavierInitializer', 'BilinearInitializer',
-    'MSRAInitializer'
+    'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA',
+    'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer',
+    'UniformInitializer', 'NormalInitializer', 'XavierInitializer',
+    'BilinearInitializer', 'MSRAInitializer'
 ]
 
 _force_init_on_cpu_ = False
@@ -353,30 +353,42 @@ class MSRAInitializer(Initializer):
     """Implements the MSRA initializer a.k.a. Kaiming Initializer
 
     This class implements the weight initialization from the paper
-    Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-    ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren
-    and Jian Sun. This is a robust initialization method that particularly
-    considers the rectifier nonlinearities. In case of Uniform distribution,
-    the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal
-    distribution, the mean is 0 and the standard deviation
-    is sqrt(2/ fan_in).
+    `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
+    by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
+    robust initialization method that particularly considers the rectifier
+    nonlinearities. In case of Uniform distribution, the range is [-x, x], where
+
+    .. math::
+
+        x = \sqrt{\\frac{6.0}{fan\_in}}
+
+    In case of Normal distribution, the mean is 0 and the standard deviation
+    is
+
+    .. math::
+
+        \sqrt{\\frac{2.0}{fan\_in}}
 
-    References:
-        [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance
-            on ImageNet Classification
-            (https://arxiv.org/abs/1502.01852)
-    """
+    Args:
+        uniform (bool): whether to use uniform or normal distribution
+        fan_in (float): fan_in for MSRAInitializer. If None, it is\
+        inferred from the variable.
+        seed (int): random seed
+
+    Note:
+        It is recommended to set fan_in to None for most cases.
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(
+                input=queries, size=10,
+                param_attr=fluid.initializer.MSRA(uniform=False))
+    """
 
     def __init__(self, uniform=True, fan_in=None, seed=0):
         """Constructor for MSRAInitializer
-
-        Args:
-            uniform: whether to use uniform or normal distribution
-            fan_in: fan_in for MSRAInitializer. If None, it is
-                    inferred from the variable.
-            seed: random seed
-
-        Note: It is recommended to set fan_in to None for most cases.
         """
         assert uniform is not None
         assert seed is not None
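
A minimal NumPy sketch of the draw the revised docstring formalizes (illustrative only, not part of the patch; the helper name `msra_sample` and its fan-in convention, the product of all non-output dimensions, are assumptions for the example):

    # Sketch of the MSRA/Kaiming draw described above, assuming plain NumPy.
    import numpy as np

    def msra_sample(shape, uniform=True, seed=0):
        rng = np.random.RandomState(seed)
        fan_in = int(np.prod(shape[1:]))       # inputs feeding each output unit
        if uniform:
            x = np.sqrt(6.0 / fan_in)          # range is [-x, x]
            return rng.uniform(-x, x, size=shape)
        # mean 0, standard deviation sqrt(2 / fan_in)
        return rng.normal(0.0, np.sqrt(2.0 / fan_in), size=shape)

    w = msra_sample((64, 3, 3, 3), uniform=False)   # conv filter, fan_in = 27
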
@@ -436,34 +448,37 @@ class MSRAInitializer(Initializer):
 
 
 class BilinearInitializer(Initializer):
-    """Implements the bilinear initializer.
-
+    """
     This initializer can be used in transposed convolution operator to
     act as upsampling. Users can upsample a feature map with shape of
     (B, C, H, W) by any integer factor. The usage is:
 
-    >>> factor = 2
-    >>> w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.),
-    >>>                    initializer=Bilinear())
-    >>> conv_up = fluid.layers.conv2d_transpose(
-    >>>     input,
-    >>>     num_filters=C,
-    >>>     output_size=None,
-    >>>     filter_size=2 * factor - factor % 2,
-    >>>     padding=ceil((factor - 1) / 2.),
-    >>>     stride=factor,
-    >>>     groups=C,
-    >>>     param_attr=w_attr,
-    >>>     bias_attr=False)
+    Examples:
+
+        .. code-block:: python
+
+            factor = 2
+            w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.),
+                               initializer=Bilinear())
+            conv_up = fluid.layers.conv2d_transpose(
+                input,
+                num_filters=C,
+                output_size=None,
+                filter_size=2 * factor - factor % 2,
+                padding=ceil((factor - 1) / 2.),
+                stride=factor,
+                groups=C,
+                param_attr=w_attr,
+                bias_attr=False)
 
-    Where, `num_filters=C` and `groups=C` means this is channel-wise tranposed
+    Where, `num_filters=C` and `groups=C` means this is channel-wise transposed
     convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`,
     This initializer will set a (K, K) interpolation kernel for every channel
     of the filter identically. The resulting shape of the output feature map
     will be (B, C, factor * H, factor * W). Note that the learning rate and the
     weight decay are set to 0 in order to keep coefficient values of bilinear
     interpolation unchanged during training.
 
     """
 
     def __init__(self):
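
For reference, the (K, K) interpolation kernel that the docstring says is set identically for every channel can be sketched in NumPy as follows (illustrative only, not the operator's actual code; `bilinear_kernel` is a hypothetical helper using the usual FCN-style center formula):

    # Sketch of the channel-wise bilinear filter, assuming plain NumPy.
    import numpy as np

    def bilinear_kernel(C, K):
        f = np.ceil(K / 2.0)
        c = (2 * f - 1 - f % 2) / (2.0 * f)    # kernel center
        og = np.ogrid[:K, :K]
        k = (1 - abs(og[0] / f - c)) * (1 - abs(og[1] / f - c))
        w = np.zeros((C, 1, K, K))             # one input channel per group
        w[:, 0] = k                            # identical kernel per channel
        return w

    factor = 2
    w = bilinear_kernel(C=16, K=2 * factor - factor % 2)   # shape (16, 1, 4, 4)
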
@@ -480,7 +495,7 @@ class BilinearInitializer(Initializer):
                            be added.
 
         Returns:
-            the initialization op
+            Operator: the initialization op
 
         Raises:
             ValueError: If type of `var` and `block` is not right.