# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import collections
import inspect

import paddle.trainer.config_parser as cp
from paddle.trainer.config_parser import *
from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
    ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
from .evaluators import *
from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \
    CudnnAvgPooling, CudnnAvgInclPadPooling, CudnnMaxPooling
from .attrs import *
from .default_decorators import *

try:
    import cPickle as pickle
except ImportError:
    import pickle
import copy

__all__ = [
|
|
'full_matrix_projection',
|
|
'AggregateLevel',
|
|
'ExpandLevel',
|
|
'identity_projection',
|
|
'dotmul_projection',
|
|
'dotmul_operator',
|
|
'repeat_layer',
|
|
'seq_reshape_layer',
|
|
'table_projection',
|
|
'mixed_layer',
|
|
'data_layer',
|
|
'embedding_layer',
|
|
'fc_layer',
|
|
'grumemory',
|
|
'pooling_layer',
|
|
'lstmemory',
|
|
'last_seq',
|
|
'first_seq',
|
|
'cos_sim',
|
|
'l2_distance_layer',
|
|
'hsigmoid',
|
|
'conv_projection',
|
|
'square_error_cost',
|
|
'regression_cost',
|
|
'classification_cost',
|
|
'LayerOutput',
|
|
'img_conv_layer',
|
|
'img_pool_layer',
|
|
'batch_norm_layer',
|
|
'img_cmrnorm_layer',
|
|
'addto_layer',
|
|
'concat_layer',
|
|
'seq_concat_layer',
|
|
'lstm_step_layer',
|
|
'recurrent_group',
|
|
'memory',
|
|
'StaticInput',
|
|
'expand_layer',
|
|
'scaling_layer',
|
|
'scaling_projection',
|
|
'power_layer',
|
|
'interpolation_layer',
|
|
'bilinear_interp_layer',
|
|
'trans_layer',
|
|
'rotate_layer',
|
|
'sum_to_one_norm_layer',
|
|
'row_l2_norm_layer',
|
|
'get_output_layer',
|
|
'LayerType',
|
|
'context_projection',
|
|
'beam_search',
|
|
'maxid_layer',
|
|
'GeneratedInput',
|
|
'SubsequenceInput',
|
|
'gru_step_layer',
|
|
'gru_step_naive_layer',
|
|
'recurrent_layer',
|
|
'BaseGeneratedInput',
|
|
'conv_operator',
|
|
'conv_shift_layer',
|
|
'tensor_layer',
|
|
'selective_fc_layer',
|
|
'sampling_id_layer',
|
|
'slope_intercept_layer',
|
|
'trans_full_matrix_projection',
|
|
'linear_comb_layer',
|
|
'convex_comb_layer',
|
|
'ctc_layer',
|
|
'warp_ctc_layer',
|
|
'crf_layer',
|
|
'crf_decoding_layer',
|
|
'nce_layer',
|
|
'cross_entropy_with_selfnorm',
|
|
'cross_entropy',
|
|
'BeamInput',
|
|
'cross_entropy_over_beam',
|
|
'multi_binary_label_cross_entropy',
|
|
'sum_cost',
|
|
'rank_cost',
|
|
'lambda_cost',
|
|
'huber_regression_cost',
|
|
'huber_classification_cost',
|
|
'block_expand_layer',
|
|
'maxout_layer',
|
|
'dot_prod_layer',
|
|
'out_prod_layer',
|
|
'printer_layer',
|
|
'print_layer',
|
|
'priorbox_layer',
|
|
'cross_channel_norm_layer',
|
|
'multibox_loss_layer',
|
|
'detection_output_layer',
|
|
'roi_pool_layer',
|
|
'spp_layer',
|
|
'pad_layer',
|
|
'eos_layer',
|
|
'smooth_l1_cost',
|
|
'layer_support',
|
|
'multiplex_layer',
|
|
'row_conv_layer',
|
|
'dropout_layer',
|
|
'prelu_layer',
|
|
'switch_order_layer',
|
|
'gated_unit_layer',
|
|
'crop_layer',
|
|
'sub_nested_seq_layer',
|
|
'clip_layer',
|
|
'slice_projection',
|
|
'seq_slice_layer',
|
|
'kmax_seq_score_layer',
|
|
'img_pool3d_layer',
|
|
'scale_shift_layer',
|
|
'img_conv3d_layer',
|
|
'resize_layer',
|
|
'sub_seq_layer',
|
|
'scale_sub_region_layer',
|
|
'factorization_machine',
|
|
]
|
|
|
|
|
|
class LayerType(object):
|
|
"""
|
|
Layer type enumerations.
|
|
"""
|
|
|
|
DATA = 'data'
|
|
MIXED_LAYER = 'mixed'
|
|
LSTMEMORY = 'lstmemory'
|
|
GRUMEMORY = 'gated_recurrent'
|
|
SEQUENCE_LAST_INSTANCE = 'seqlastins'
|
|
SEQUENCE_FIRST_INSTANCE = 'seqfirstins'
|
|
SEQUENCE_RESHAPE = 'seqreshape'
|
|
POOLING_MAX = 'max'
|
|
POOLING_AVG = 'average'
|
|
FC_LAYER = 'fc'
|
|
COST = 'cost'
|
|
COSINE_SIM_VEC = 'cos_vm'
|
|
COSINE_SIM = 'cos'
|
|
L2_DISTANCE = 'l2_distance'
|
|
HSIGMOID = 'hsigmoid'
|
|
CONV_LAYER = 'conv'
|
|
CONVTRANS_LAYER = 'convt'
|
|
EXCONV_LAYER = 'exconv'
|
|
EXCONVTRANS_LAYER = 'exconvt'
|
|
CUDNNCONV_LAYER = 'cudnn_conv'
|
|
CUDNNCONVTRANS_LAYER = 'cudnn_convt'
|
|
POOL_LAYER = 'pool'
|
|
POOL3D_LAYER = 'pool3d'
|
|
BATCH_NORM_LAYER = 'batch_norm'
|
|
NORM_LAYER = 'norm'
|
|
SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm'
|
|
ROW_L2_NORM_LAYER = 'row_l2_norm'
|
|
ADDTO_LAYER = 'addto'
|
|
|
|
CONCAT_LAYER = 'concat'
|
|
CONCAT_PROJ_LAYER = 'concat2'
|
|
SEQUENCE_CONCAT_LAYER = 'seqconcat'
|
|
|
|
LSTM_STEP_LAYER = 'lstm_step'
|
|
GRU_STEP_LAYER = 'gru_step'
|
|
GET_OUTPUT_LAYER = 'get_output'
|
|
|
|
EXPAND_LAYER = 'expand'
|
|
INTERPOLATION_LAYER = 'interpolation'
|
|
BILINEAR_INTERP_LAYER = 'bilinear_interp'
|
|
POWER_LAYER = 'power'
|
|
SCALING_LAYER = 'scaling'
|
|
TRANS_LAYER = 'trans'
|
|
ROTATE_LAYER = 'rotate'
|
|
DOT_PROD_LAYER = 'dot_prod'
|
|
OUT_PROD_LAYER = 'out_prod'
|
|
FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
|
|
|
|
MEMORY = 'memory'
|
|
MAXID_LAYER = 'maxid'
|
|
EOSID_LAYER = 'eos_id'
|
|
RECURRENT_LAYER = 'recurrent'
|
|
|
|
CONV_SHIFT_LAYER = "conv_shift"
|
|
TENSOR_LAYER = "tensor"
|
|
SEL_FC_LAYER = "selective_fc"
|
|
SAMPLING_ID_LAYER = "sampling_id"
|
|
SLOPE_INTERCEPT_LAYER = "slope_intercept"
|
|
LINEAR_COMBINATION_LAYER = "convex_comb"
|
|
BLOCK_EXPAND = "blockexpand"
|
|
MAXOUT = "maxout"
|
|
SPP_LAYER = "spp"
|
|
PAD_LAYER = "pad"
|
|
MULTIPLEX_LAYER = "multiplex"
|
|
ROW_CONV_LAYER = "row_conv"
|
|
|
|
PRINT_LAYER = 'print'
|
|
PRIORBOX_LAYER = 'priorbox'
|
|
MULTIBOX_LOSS_LAYER = 'multibox_loss'
|
|
DETECTION_OUTPUT_LAYER = 'detection_output'
|
|
ROI_POOL_LAYER = 'roi_pool'
|
|
|
|
CTC_LAYER = 'ctc'
|
|
WARP_CTC_LAYER = 'warp_ctc'
|
|
CRF_LAYER = 'crf'
|
|
CRF_DECODING_LAYER = 'crf_decoding'
|
|
NCE_LAYER = 'nce'
|
|
|
|
CONV3D_LAYER = 'conv3d'
|
|
DECONV3D_LAYER = 'deconv3d'
|
|
|
|
RANK_COST = 'rank-cost'
|
|
LAMBDA_COST = 'lambda_cost'
|
|
HUBER_REGRESSION = 'huber_regression'
|
|
HUBER_CLASSIFICATION = 'huber_classification'
|
|
CROSS_ENTROPY = 'multi-class-cross-entropy'
|
|
CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
|
|
CROSS_ENTROPY_OVER_BEAM = 'cross_entropy_over_beam'
|
|
SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
|
|
MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy'
|
|
SUM_COST = 'sum_cost'
|
|
SMOOTH_L1 = 'smooth_l1'
|
|
|
|
PRELU = 'prelu'
|
|
SWITCH_ORDER_LAYER = 'switch_order'
|
|
CROP_LAYER = 'crop'
|
|
SUB_NESTED_SEQ = 'sub_nested_seq'
|
|
CLIP_LAYER = 'clip'
|
|
SEQ_SLICE = 'seq_slice'
|
|
|
|
KMAX_SEQ_SCORE = 'kmax_seq_score'
|
|
SCALE_SHIFT_LAYER = 'scale_shift'
|
|
|
|
RESIZE = 'resize'
|
|
SUB_SEQ_LAYER = 'subseq'
|
|
|
|
SCALE_SUB_REGION_LAYER = 'scale_sub_region'
|
|
|
|
FACTORIZATION_MACHINE = 'factorization_machine'
|
|
|
|
@staticmethod
|
|
def is_layer_type(type_name):
|
|
"""
|
|
Whether type_name is a layer type.
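
        A quick check of typical values (a sketch; "fc" is one of the
        enumeration values defined above, while "foo" is not):

        .. code-block:: python

           LayerType.is_layer_type("fc")   # True
           LayerType.is_layer_type("foo")  # False
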
|
|
|
|
        :param type_name: layer type name, since layer type enumerations are
                          strings.
|
|
:type type_name: basestring
|
|
:return: True if is a layer_type
|
|
:rtype: bool
|
|
"""
|
|
for key in dir(LayerType):
|
|
if key.isupper():
|
|
att = getattr(LayerType, key)
|
|
if isinstance(att, basestring) and type_name == att:
|
|
return True
|
|
return False
|
|
|
|
|
|
class AggregateLevel(object):
|
|
"""
|
|
PaddlePaddle supports three sequence types:
|
|
|
|
- :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence.
|
|
- :code:`SequenceType.SEQUENCE` means the sample is a sequence.
|
|
- :code:`SequenceType.SUB_SEQUENCE` means the sample is a nested sequence,
|
|
each timestep of which is also a sequence.
|
|
|
|
Accordingly, AggregateLevel supports two modes:
|
|
|
|
- :code:`AggregateLevel.TO_NO_SEQUENCE` means the aggregation acts on each
|
|
timestep of a sequence, both :code:`SUB_SEQUENCE` and :code:`SEQUENCE` will
|
|
be aggregated to :code:`NO_SEQUENCE`.
|
|
|
|
- :code:`AggregateLevel.TO_SEQUENCE` means the aggregation acts on each
|
|
sequence of a nested sequence, :code:`SUB_SEQUENCE` will be aggregated to
|
|
:code:`SEQUENCE`.
|
|
"""
|
|
TO_NO_SEQUENCE = 'non-seq'
|
|
TO_SEQUENCE = 'seq'
|
|
# compatible with previous configuration
|
|
EACH_TIMESTEP = TO_NO_SEQUENCE
|
|
EACH_SEQUENCE = TO_SEQUENCE
|
|
|
|
|
|
class LayerOutput(object):
|
|
"""
|
|
    LayerOutput is the output of a layer function. It is used internally for
    several reasons:

    - Checking that layer connections make sense.

        - For example, FC(Softmax) => Cost(MSE Error) is not a sensible connection.

    - Tracking layer connections.

    - Passing to layer methods as input.
|
|
|
|
:param name: Layer output name.
|
|
:type name: basestring
|
|
:param layer_type: Current Layer Type. One of LayerType enumeration.
|
|
:type layer_type: basestring
|
|
:param activation: Layer Activation.
|
|
:type activation: BaseActivation.
|
|
:param parents: Layer's parents.
|
|
:type parents: list | tuple | collections.Sequence
|
|
"""
|
|
|
|
def __init__(self,
|
|
name,
|
|
layer_type,
|
|
parents=None,
|
|
activation=None,
|
|
num_filters=None,
|
|
img_norm_type=None,
|
|
size=None,
|
|
outputs=None,
|
|
reverse=None):
|
|
assert isinstance(name, basestring)
|
|
assert isinstance(layer_type, basestring)
|
|
assert size is not None
|
|
assert LayerType.is_layer_type(layer_type)
|
|
self.name = name
|
|
self.full_name = MakeLayerNameInSubmodel(name)
|
|
self.layer_type = layer_type
|
|
if parents is not None and type(parents) != list:
|
|
parents = [parents]
|
|
self.parents = [] if parents is None else parents
|
|
self.activation = activation
|
|
self.num_filters = num_filters
|
|
self.img_norm_type = img_norm_type
|
|
self.size = size
|
|
if outputs is None:
|
|
outputs = ['default']
|
|
self.outputs = outputs
|
|
self.reverse = reverse
|
|
|
|
@property
|
|
def width(self):
|
|
return cp.g_layer_map[self.full_name].width
|
|
|
|
@property
|
|
def height(self):
|
|
return cp.g_layer_map[self.full_name].height
|
|
|
|
@property
|
|
def depth(self):
|
|
return cp.g_layer_map[self.full_name].depth
|
|
|
|
def set_input(self, input):
|
|
"""
|
|
Set the input for a memory layer. Can only be used for memory layer
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
assert self.layer_type == LayerType.MEMORY
|
|
SetMemoryInput(self.name, input.name)
|
|
|
|
|
|
ERROR_CLIPPING = 'error_clipping_threshold'
|
|
DROPOUT = 'drop_rate'
|
|
DEVICE = 'device'
|
|
|
|
|
|
def layer_support(*attrs):
|
|
attrs_list = list(attrs)
|
|
attrs_list.append(DEVICE)
|
|
|
|
def decorator(method):
|
|
@functools.wraps(method)
|
|
def wrapper(*args, **kwargs):
|
|
for attr in attrs_list:
|
|
for each in args:
|
|
if isinstance(each, ExtraLayerAttribute):
|
|
setattr(each, '_'.join(['can', attr]), True)
|
|
for key in kwargs:
|
|
val = kwargs[key]
|
|
if isinstance(val, ExtraLayerAttribute):
|
|
setattr(val, '_'.join(['can', attr]), True)
|
|
for each in args:
|
|
if isinstance(each, ExtraLayerAttribute):
|
|
each.check(method.__name__)
|
|
for key in kwargs:
|
|
val = kwargs[key]
|
|
if isinstance(val, ExtraLayerAttribute):
|
|
val.check(method.__name__)
|
|
return method(*args, **kwargs)
|
|
|
|
if hasattr(method, 'argspec'):
|
|
wrapper.argspec = method.argspec
|
|
else:
|
|
wrapper.argspec = inspect.getargspec(method)
|
|
|
|
return wrapper
|
|
|
|
return decorator
|
|
|
|
|
|
@wrap_param_attr_default()
|
|
def full_matrix_projection(input, size=0, param_attr=None):
|
|
"""
|
|
Full Matrix Projection. It performs full matrix multiplication.
|
|
|
|
.. math::
|
|
out.row[i] += in.row[i] * weight
|
|
|
|
There are two styles of usage.
|
|
|
|
1. When used in mixed_layer like this, you can only set the input:
|
|
|
|
.. code-block:: python
|
|
|
|
with mixed_layer(size=100) as m:
|
|
m += full_matrix_projection(input=layer)
|
|
|
|
2. When used as an independent object like this, you must set the size:
|
|
|
|
.. code-block:: python
|
|
|
|
proj = full_matrix_projection(input=layer,
|
|
size=100,
|
|
param_attr=ParamAttr(name='_proj'))
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param size: The dimension of this layer.
|
|
:type size: int
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:return: FullMatrixProjection Object.
|
|
:rtype: FullMatrixProjection
|
|
"""
|
|
proj = FullMatrixProjection(
|
|
input_layer_name=input.name, size=size, **param_attr.attr)
|
|
proj.origin = input
|
|
return proj
|
|
|
|
|
|
@wrap_param_attr_default()
|
|
def trans_full_matrix_projection(input, size=0, param_attr=None):
|
|
"""
|
|
Different from full_matrix_projection, this projection performs matrix
|
|
multiplication, using the transpose of weight.
|
|
|
|
.. math::
|
|
out.row[i] += in.row[i] * w^\mathrm{T}
|
|
|
|
:math:`w^\mathrm{T}` means the transpose of weight.
|
|
    The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
proj = trans_full_matrix_projection(input=layer,
|
|
size=100,
|
|
param_attr=ParamAttr(
|
|
name='_proj',
|
|
initial_mean=0.0,
|
|
initial_std=0.01))
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param size: The parameter size. Means the width of parameter.
|
|
:type size: int
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:return: TransposedFullMatrixProjection Object.
|
|
:rtype: TransposedFullMatrixProjection
|
|
"""
|
|
proj = TransposedFullMatrixProjection(
|
|
input_layer_name=input.name, size=size, **param_attr.attr)
|
|
proj.origin = input
|
|
return proj
|
|
|
|
|
|
@wrap_param_attr_default()
|
|
def table_projection(input, size=0, param_attr=None):
|
|
"""
|
|
Table Projection. It selects rows from parameter where row\_id
|
|
is in input\_ids.
|
|
|
|
.. math::
|
|
out.row[i] += table.row[ids[i]]
|
|
|
|
where :math:`out` is output, :math:`table` is parameter, :math:`ids` is input\_ids,
|
|
and :math:`i` is row\_id.
|
|
|
|
There are two styles of usage.
|
|
|
|
1. When used in mixed_layer like this, you can only set the input:
|
|
|
|
.. code-block:: python
|
|
|
|
with mixed_layer(size=100) as m:
|
|
m += table_projection(input=layer)
|
|
|
|
2. When used as an independent object like this, you must set the size:
|
|
|
|
.. code-block:: python
|
|
|
|
proj = table_projection(input=layer,
|
|
size=100,
|
|
param_attr=ParamAttr(name='_proj'))
|
|
|
|
|
|
    :param input: The input of this layer, which must contain id fields.
|
|
:type input: LayerOutput
|
|
:param size: The dimension of the output.
|
|
:type size: int
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:return: TableProjection Object.
|
|
:rtype: TableProjection
|
|
"""
|
|
proj = TableProjection(
|
|
input_layer_name=input.name, size=size, **param_attr.attr)
|
|
proj.origin = input
|
|
return proj
|
|
|
|
|
|
def identity_projection(input, offset=None, size=None):
|
|
"""
|
|
1. If offset=None, it performs IdentityProjection as follows:
|
|
|
|
.. math::
|
|
out.row[i] += in.row[i]
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
proj = identity_projection(input=layer)
|
|
|
|
|
|
2. If offset!=None, It executes IdentityOffsetProjection and takes the
|
|
elements of the input in the range [offset, offset+size) as output.
|
|
|
|
.. math::
|
|
out.row[i] += in.row[i + \\textrm{offset}]
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
proj = identity_projection(input=layer,
|
|
offset=10)
|
|
|
|
    Note that neither of the projections has trainable parameters.
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param offset: The offset from the start of the input. The input's
|
|
elements in the range [offset, offset+size) will be
|
|
taken as output. If this parameter is not set or set
|
|
to None, the output will be the same as the input.
|
|
:type offset: int
|
|
:param size: The dimension of this layer. It will be neglected
|
|
when offset is None or not set.
|
|
:type size: int
|
|
:return: IdentityProjection or IdentityOffsetProjection object
|
|
:rtype: IdentityProjection | IdentityOffsetProjection
|
|
"""
|
|
if offset is None:
|
|
proj = IdentityProjection(input_layer_name=input.name)
|
|
proj.origin = input
|
|
else:
|
|
if size is None:
|
|
size = input.size - offset
|
|
proj = IdentityOffsetProjection(
|
|
input_layer_name=input.name, offset=offset, size=size)
|
|
proj.origin = input
|
|
return proj
|
|
|
|
|
|
def slice_projection(input, slices):
|
|
"""
|
|
slice_projection slices the input value into multiple parts,
|
|
then selects and merges some of them into a new output.
|
|
|
|
.. math::
|
|
output = [input.slices()]
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)])
|
|
|
|
Note that slice_projection has no trainable parameter.
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param slices: A list of start and end offsets of each slice.
|
|
:type slices: list of tuple
|
|
:return: SliceProjection object.
|
|
:rtype: SliceProjection
|
|
"""
|
|
assert len(slices) >= 1
|
|
start = 0
|
|
for i in xrange(len(slices)):
|
|
assert len(slices[i]) == 2
|
|
# The start position of the next slice needs to be greater than
|
|
# or equal to the end position of the previous slice.
|
|
assert slices[i][0] >= start
|
|
assert slices[i][1] >= slices[i][0]
|
|
start = slices[i][1]
|
|
proj = SliceProjection(input_layer_name=input.name, slices=slices)
|
|
proj.origin = input
|
|
return proj
|
|
|
|
|
|
@wrap_param_attr_default()
|
|
def scaling_projection(input, param_attr=None):
|
|
"""
|
|
scaling_projection multiplies the input with a scalar parameter.
|
|
|
|
.. math::
|
|
out += w * in
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
proj = scaling_projection(input=layer)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:return: ScalingProjection object.
|
|
:rtype: ScalingProjection
|
|
"""
|
|
proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr)
|
|
proj.origin = input
|
|
return proj
|
|
|
|
|
|
@wrap_param_attr_default()
|
|
def dotmul_projection(input, param_attr=None):
|
|
"""
|
|
DotMulProjection takes a layer as input and performs
|
|
element-wise multiplication with weight.
|
|
|
|
.. math::
|
|
out.row[i] += in.row[i] .* weight
|
|
|
|
where :math:`.*` means element-wise multiplication.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
proj = dotmul_projection(input=layer)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:return: DotMulProjection object.
|
|
:rtype: DotMulProjection
|
|
"""
|
|
proj = DotMulProjection(
|
|
input_layer_name=input.name, size=input.size, **param_attr.attr)
|
|
proj.origin = input
|
|
return proj
|
|
|
|
|
|
def dotmul_operator(a=None, b=None, scale=1, **kwargs):
|
|
"""
|
|
DotMulOperator takes two inputs and performs element-wise multiplication:
|
|
|
|
.. math::
|
|
out.row[i] += scale * (a.row[i] .* b.row[i])
|
|
|
|
where :math:`.*` means element-wise multiplication, and
|
|
scale is a config scalar, its default value is 1.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
op = dotmul_operator(a=layer1, b=layer2, scale=0.5)
|
|
|
|
:param a: The first input of this layer.
|
|
:type a: LayerOutput
|
|
:param b: The second input of this layer.
|
|
:type b: LayerOutput
|
|
:param scale: A scalar to scale the product. Its default value is 1.
|
|
:type scale: float
|
|
:return: DotMulOperator object.
|
|
:rtype: DotMulOperator
|
|
"""
|
|
if 'x' in kwargs or 'y' in kwargs:
|
|
        logger.warning('The x and y arguments of dotmul_operator are deprecated. '
                       'Please use a and b instead.')
        a = kwargs.get('x', a)  # For backward compatibility.
|
|
b = kwargs.get('y', b)
|
|
assert isinstance(a, LayerOutput)
|
|
assert isinstance(b, LayerOutput)
|
|
if a.size is not None and b.size is not None:
|
|
assert a.size == b.size
|
|
|
|
op = DotMulOperator(input_layer_names=[a.name, b.name], scale=scale)
|
|
op.origin = [a, b]
|
|
return op
|
|
|
|
|
|
@wrap_bias_attr_default(['padding_attr'])
|
|
def context_projection(input,
|
|
context_len,
|
|
context_start=None,
|
|
padding_attr=False):
|
|
"""
|
|
Context Projection.
|
|
|
|
    It simply reorganizes the input sequence, combining "context_len" elements
    of the sequence into one context, starting from "context_start".
    "context_start" is set to -(context_len - 1) / 2 by default. When a context
    position is out of the sequence length, padding is filled with zero if
    padding_attr = False; otherwise the padding is trainable.
|
|
|
|
    For example, if the original sequence is [A B C D E F G], context_len is 3,
    and padding_attr is not set, then after context projection the sequence will
    be [ 0AB ABC BCD CDE DEF EFG FG0 ].
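
    A minimal usage sketch (assuming :code:`seq` is a sequence layer defined
    elsewhere in the configuration):

    .. code-block:: python

       proj = context_projection(input=seq, context_len=3)
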
|
|
|
|
:param input: The input of this layer, which should be a sequence.
|
|
:type input: LayerOutput
|
|
:param context_len: The length of the context.
|
|
:type context_len: int
|
|
:param context_start: The start position of the context. The default value is
|
|
-(context_len - 1)/2
|
|
:type context_start: int
|
|
:param padding_attr: Parameter attribute of the padding. If the parameter is
|
|
set to False, padding will be zero. In other cases, the
|
|
padding is trainable, and its parameter attribute is set
|
|
by this parameter.
|
|
:type padding_attr: bool | ParameterAttribute
|
|
:return: Projection object.
|
|
:rtype: Projection
|
|
"""
|
|
context_start = -(
|
|
context_len - 1) / 2 if context_start is None else context_start
|
|
|
|
extra_dict = dict()
|
|
trainable = isinstance(padding_attr, ParameterAttribute)
|
|
if trainable:
|
|
extra_dict = padding_attr.attr
|
|
|
|
proj = ContextProjection(
|
|
input_layer_name=input.name,
|
|
context_length=context_len,
|
|
context_start=context_start,
|
|
trainable_padding=trainable,
|
|
**extra_dict)
|
|
proj.origin = input
|
|
return proj
|
|
|
|
|
|
class MixedLayerType(LayerOutput):
|
|
"""
|
|
The internal object for trainer_helpers.
|
|
"""
|
|
|
|
class AddToSealedMixedLayerException(Exception):
|
|
def __init__(self):
|
|
Exception.__init__(self)
|
|
|
|
def __init__(self, name, size, act, bias_attr, layer_attr, parents=None):
|
|
"""
|
|
:param name: The name of this layer.
|
|
:type name: basestring
|
|
:param size: The dimension of this layer.
|
|
:type size: int
|
|
:param act: Activation type.
|
|
:type act: BaseActivation
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
"""
|
|
LayerOutput.__init__(
|
|
self,
|
|
name,
|
|
LayerType.MIXED_LAYER,
|
|
parents,
|
|
size=size,
|
|
activation=act)
|
|
self.bias_attr = bias_attr
|
|
self.layer_attr = layer_attr
|
|
self.inputs = []
|
|
self.finalized = False
|
|
|
|
def __iadd__(self, other):
|
|
"""
|
|
        += operator
|
|
:param other: Other projection.
|
|
:type other: Projection
|
|
:return: self.
|
|
:rtype: MixedLayerType
|
|
"""
|
|
if not self.finalized:
|
|
assert isinstance(other, Projection) or isinstance(other, Operator)
|
|
self.inputs.append(other)
|
|
if isinstance(other, Projection):
|
|
self.parents.append(other.origin)
|
|
else:
|
|
self.parents.extend(other.origin)
|
|
return self
|
|
else:
|
|
raise MixedLayerType.AddToSealedMixedLayerException()
|
|
|
|
def __enter__(self):
|
|
assert len(self.inputs) == 0
|
|
return self
|
|
|
|
def __exit__(self, exc_type, exc_value, tb):
|
|
if exc_value is not None:
|
|
raise exc_value
|
|
assert len(self.inputs) != 0
|
|
ml = MixedLayer(
|
|
name=self.name,
|
|
size=self.size,
|
|
active_type=self.activation.name,
|
|
bias=ParamAttr.to_bias(self.bias_attr),
|
|
inputs=self.inputs,
|
|
**ExtraLayerAttribute.to_kwargs(self.layer_attr))
|
|
# update the size which might be computed inside MixedLayer
|
|
# according to the operator's output size
|
|
self.size = ml.config.size
|
|
self.finalized = True
|
|
|
|
|
|
@wrap_name_default("mixed")
|
|
@wrap_act_default(act=LinearActivation())
|
|
@wrap_bias_attr_default(has_bias=False)
|
|
@layer_support(ERROR_CLIPPING, DROPOUT)
|
|
def mixed_layer(size=0,
|
|
input=None,
|
|
name=None,
|
|
act=None,
|
|
bias_attr=False,
|
|
layer_attr=None):
|
|
"""
|
|
Mixed Layer. A mixed layer will add all inputs together, then activate the sum.
|
|
Each input is a projection or operator.
|
|
|
|
There are two styles of usages.
|
|
|
|
1. When the parameter input is not set, use mixed_layer like this:
|
|
|
|
.. code-block:: python
|
|
|
|
with mixed_layer(size=256) as m:
|
|
m += full_matrix_projection(input=layer1)
|
|
m += identity_projection(input=layer2)
|
|
|
|
2. You can also set all inputs when invoke mixed_layer as follows:
|
|
|
|
.. code-block:: python
|
|
|
|
m = mixed_layer(size=256,
|
|
input=[full_matrix_projection(input=layer1),
|
|
full_matrix_projection(input=layer2)])
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param size: The dimension of this layer.
|
|
:type size: int
|
|
:param input: The input of this layer. It is an optional parameter.
|
|
:param act: Activation Type. LinearActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: MixedLayerType object.
|
|
:rtype: MixedLayerType
|
|
"""
|
|
|
|
if input is None:
|
|
return MixedLayerType(name, size, act, bias_attr, layer_attr)
|
|
else:
|
|
with mixed_layer(
|
|
name=name,
|
|
size=size,
|
|
act=act,
|
|
bias_attr=bias_attr,
|
|
layer_attr=layer_attr) as m:
|
|
if isinstance(input, collections.Sequence):
|
|
for each in input:
|
|
m += each
|
|
else:
|
|
m += input
|
|
return m
|
|
|
|
|
|
@layer_support()
|
|
def data_layer(name, size, depth=None, height=None, width=None,
|
|
layer_attr=None):
|
|
"""
|
|
    Define a data layer for the neural network.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
data = data_layer(name="input", size=1000)
|
|
|
|
:param name: The name of this layer.
|
|
:type name: basestring
|
|
:param size: The dimension of this data layer.
|
|
:type size: int
|
|
    :param depth: The depth of the input 3D image data. The default value is None.
    :type depth: int | None
    :param height: The height of the input image data.
    :type height: int | None
|
|
:param width: The width of the input image data.
|
|
:type width: int | None
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
Layer(
|
|
type=LayerType.DATA,
|
|
name=name,
|
|
size=size,
|
|
depth=depth,
|
|
height=height,
|
|
width=width,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
if depth is None:
|
|
depth = 1
|
|
num_filters = None
|
|
if height is not None and width is not None:
|
|
num_filters = size / (width * height * depth)
|
|
assert num_filters * width * height * depth == size, \
|
|
"size=%s width=%s height=%s depth=%s" % (size, width, height, depth)
|
|
|
|
return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters)
|
|
|
|
|
|
@wrap_name_default("embedding")
|
|
@wrap_param_attr_default()
|
|
@layer_support(ERROR_CLIPPING, DROPOUT)
|
|
def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None):
|
|
"""
|
|
    Define an embedding layer.
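
    A minimal usage sketch (assuming :code:`word_id` is an index-typed data
    layer defined elsewhere, e.g. by data_layer, and 256 is an illustrative
    embedding size):

    .. code-block:: python

       emb = embedding_layer(input=word_id, size=256)
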
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer, whose type must be Index Data.
|
|
:type input: LayerOutput
|
|
:param size: The dimension of the embedding vector.
|
|
:type size: int
|
|
:param param_attr: The embedding parameter attribute. See ParameterAttribute
|
|
for details.
|
|
:type param_attr: ParameterAttribute
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
with mixed_layer(
|
|
name=name,
|
|
size=size,
|
|
act=LinearActivation(),
|
|
bias_attr=False,
|
|
layer_attr=layer_attr) as mix:
|
|
mix += table_projection(input=input, size=size, param_attr=param_attr)
|
|
return mix
|
|
|
|
|
|
@wrap_name_default()
|
|
@wrap_param_attr_default()
|
|
@wrap_bias_attr_default()
|
|
@wrap_act_default()
|
|
@layer_support(ERROR_CLIPPING, DROPOUT)
|
|
def fc_layer(input,
|
|
size,
|
|
act=None,
|
|
name=None,
|
|
param_attr=None,
|
|
bias_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
The fully connected layer.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
fc = fc_layer(input=layer,
|
|
size=1024,
|
|
act=LinearActivation(),
|
|
bias_attr=False)
|
|
|
|
which is equal to:
|
|
|
|
.. code-block:: python
|
|
|
|
with mixed_layer(size=1024) as fc:
|
|
fc += full_matrix_projection(input=layer)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput | list | tuple
|
|
:param size: The dimension of this layer.
|
|
:type size: int
|
|
:param act: Activation Type. TanhActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if isinstance(input, LayerOutput):
|
|
input = [input]
|
|
assert not isinstance(param_attr, collections.Sequence)
|
|
param_attr = [param_attr]
|
|
else:
|
|
if isinstance(param_attr, collections.Sequence):
|
|
assert len(input) == len(param_attr)
|
|
else:
|
|
if "parameter_name" in param_attr.attr and len(input) > 1:
|
|
logger.fatal(
|
|
"When the name field of param_attr is manually specified "
|
|
"and the input is a list, the param_attr should also be a "
|
|
"list with each item being the param_attr for each input "
|
|
"item. If only one named param_attr is provided, all the "
|
|
"input items would share this parameter.")
|
|
param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
|
|
|
|
assert isinstance(input, collections.Sequence)
|
|
|
|
Layer(
|
|
inputs=[
|
|
Input(ipt.name, **attr.attr) for ipt, attr in zip(input, param_attr)
|
|
],
|
|
name=name,
|
|
type=LayerType.FC_LAYER,
|
|
size=size,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
active_type=act.name,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.FC_LAYER, input, activation=act, size=size)
|
|
|
|
|
|
@wrap_name_default("print")
|
|
def printer_layer(input, format=None, name=None):
|
|
"""
|
|
Print the output value of the layers specified by the parameter input.
|
|
This layer is useful for debugging.
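
    A minimal usage sketch (assuming :code:`layer1` and :code:`layer2` are
    existing layers):

    .. code-block:: python

       printer_layer(input=[layer1, layer2])
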
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput | list | tuple
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if isinstance(input, LayerOutput):
|
|
input = [input]
|
|
assert isinstance(input, collections.Sequence) # list or tuple
|
|
for each in input:
|
|
assert isinstance(each, LayerOutput)
|
|
|
|
Layer(
|
|
name=name,
|
|
format=format,
|
|
type=LayerType.PRINT_LAYER,
|
|
inputs=[l.name for l in input], )
|
|
    # This layer doesn't return anything and cannot be used as the input of other layers.
|
|
|
|
# Keep print_layer for compatibility with V1 API.
|
|
# 'print_layer' does not work for the V2 API because it would have to be renamed
# to 'print' there, and 'print' is a reserved keyword in Python.
|
|
|
|
|
|
print_layer = printer_layer
|
|
|
|
|
|
@wrap_name_default("priorbox")
|
|
def priorbox_layer(input,
|
|
image,
|
|
aspect_ratio,
|
|
variance,
|
|
min_size,
|
|
max_size=[],
|
|
name=None):
|
|
"""
|
|
Compute the priorbox and set the variance. This layer is necessary for ssd.
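
    A minimal usage sketch (assuming :code:`conv` is a convolutional feature map
    and :code:`image` is the network input layer; the numeric values are only
    illustrative):

    .. code-block:: python

       priorbox = priorbox_layer(input=conv,
                                 image=image,
                                 aspect_ratio=[2.0],
                                 variance=[0.1, 0.1, 0.2, 0.2],
                                 min_size=[30.0],
                                 max_size=[60.0])
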
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param image: The network input image.
|
|
:type image: LayerOutput
|
|
:param aspect_ratio: The aspect ratio.
|
|
:type aspect_ratio: list
|
|
    :param variance: The bounding box variance.
    :type variance: list
    :param min_size: The minimum size of the priorbox width/height.
    :type min_size: list
    :param max_size: The maximum size of the priorbox width/height. It may be empty.
    :type max_size: list
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
# plus one for ratio 1.
|
|
num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4
|
|
size = (input.size / input.num_filters) * num_filters * 2
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.PRIORBOX_LAYER,
|
|
inputs=[input.name, image.name],
|
|
size=size,
|
|
min_size=min_size,
|
|
max_size=max_size,
|
|
aspect_ratio=aspect_ratio,
|
|
variance=variance)
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.PRIORBOX_LAYER,
|
|
parents=[input, image],
|
|
num_filters=num_filters,
|
|
size=size)
|
|
|
|
|
|
@wrap_name_default("multibox_loss")
|
|
def multibox_loss_layer(input_loc,
|
|
input_conf,
|
|
priorbox,
|
|
label,
|
|
num_classes,
|
|
overlap_threshold=0.5,
|
|
neg_pos_ratio=3.0,
|
|
neg_overlap=0.5,
|
|
background_id=0,
|
|
name=None):
|
|
"""
|
|
Compute the location loss and the confidence loss for ssd.
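
    A minimal usage sketch (assuming :code:`loc`, :code:`conf`, :code:`priorbox`
    and :code:`label` are defined elsewhere in the configuration, and 21 is an
    illustrative number of classes):

    .. code-block:: python

       loss = multibox_loss_layer(input_loc=loc,
                                  input_conf=conf,
                                  priorbox=priorbox,
                                  label=label,
                                  num_classes=21)
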
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input_loc: The input predicted locations.
|
|
:type input_loc: LayerOutput | List of LayerOutput
|
|
:param input_conf: The input priorbox confidence.
|
|
:type input_conf: LayerOutput | List of LayerOutput
|
|
:param priorbox: The input priorbox location and the variance.
|
|
:type priorbox: LayerOutput
|
|
:param label: The input label.
|
|
:type label: LayerOutput
|
|
:param num_classes: The number of the classification.
|
|
:type num_classes: int
|
|
:param overlap_threshold: The threshold of the overlap.
|
|
:type overlap_threshold: float
|
|
:param neg_pos_ratio: The ratio of the negative bounding box to
|
|
the positive bounding box.
|
|
:type neg_pos_ratio: float
|
|
:param neg_overlap: The negative bounding box overlap threshold.
|
|
:type neg_overlap: float
|
|
:param background_id: The background class index.
|
|
:type background_id: int
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if isinstance(input_loc, LayerOutput):
|
|
input_loc = [input_loc]
|
|
assert isinstance(input_loc, collections.Sequence) # list or tuple
|
|
for each in input_loc:
|
|
assert isinstance(each, LayerOutput)
|
|
input_loc_num = len(input_loc)
|
|
|
|
if isinstance(input_conf, LayerOutput):
|
|
input_conf = [input_conf]
|
|
assert isinstance(input_conf, collections.Sequence) # list or tuple
|
|
for each in input_conf:
|
|
assert isinstance(each, LayerOutput)
|
|
input_conf_num = len(input_conf)
|
|
# Check the input layer number.
|
|
assert input_loc_num == input_conf_num
|
|
|
|
inputs = [priorbox.name, label.name]
|
|
inputs.extend([l.name for l in input_loc])
|
|
inputs.extend([l.name for l in input_conf])
|
|
parents = [priorbox, label]
|
|
parents.extend(input_loc)
|
|
parents.extend(input_conf)
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.MULTIBOX_LOSS_LAYER,
|
|
inputs=inputs,
|
|
input_num=input_loc_num,
|
|
num_classes=num_classes,
|
|
overlap_threshold=overlap_threshold,
|
|
neg_pos_ratio=neg_pos_ratio,
|
|
neg_overlap=neg_overlap,
|
|
background_id=background_id)
|
|
return LayerOutput(
|
|
name, LayerType.MULTIBOX_LOSS_LAYER, parents=parents, size=1)
|
|
|
|
|
|
@wrap_name_default("detection_output")
|
|
def detection_output_layer(input_loc,
|
|
input_conf,
|
|
priorbox,
|
|
num_classes,
|
|
nms_threshold=0.45,
|
|
nms_top_k=400,
|
|
keep_top_k=200,
|
|
confidence_threshold=0.01,
|
|
background_id=0,
|
|
name=None):
|
|
"""
|
|
    Apply NMS (non-maximum suppression) to the output of the network and compute
    the predicted bounding box locations. The output of this layer could be empty
    if there is no valid bounding box.
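
    A minimal usage sketch (assuming :code:`loc`, :code:`conf` and
    :code:`priorbox` are the corresponding SSD branches defined elsewhere, and
    21 is an illustrative number of classes):

    .. code-block:: python

       detections = detection_output_layer(input_loc=loc,
                                           input_conf=conf,
                                           priorbox=priorbox,
                                           num_classes=21)
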
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input_loc: The input predict locations.
|
|
:type input_loc: LayerOutput | List of LayerOutput.
|
|
:param input_conf: The input priorbox confidence.
|
|
:type input_conf: LayerOutput | List of LayerOutput.
|
|
:param priorbox: The input priorbox location and the variance.
|
|
:type priorbox: LayerOutput
|
|
:param num_classes: The number of the classes.
|
|
:type num_classes: int
|
|
:param nms_threshold: The Non-maximum suppression threshold.
|
|
:type nms_threshold: float
|
|
    :param nms_top_k: The number of bounding boxes kept in the NMS output.
|
|
:type nms_top_k: int
|
|
    :param keep_top_k: The number of bounding boxes kept in the layer's output.
|
|
:type keep_top_k: int
|
|
:param confidence_threshold: The classification confidence threshold.
|
|
:type confidence_threshold: float
|
|
:param background_id: The background class index.
|
|
:type background_id: int
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if isinstance(input_loc, LayerOutput):
|
|
input_loc = [input_loc]
|
|
assert isinstance(input_loc, collections.Sequence) # list or tuple
|
|
for each in input_loc:
|
|
assert isinstance(each, LayerOutput)
|
|
input_loc_num = len(input_loc)
|
|
|
|
if isinstance(input_conf, LayerOutput):
|
|
input_conf = [input_conf]
|
|
assert isinstance(input_conf, collections.Sequence) # list or tuple
|
|
for each in input_conf:
|
|
assert isinstance(each, LayerOutput)
|
|
input_conf_num = len(input_conf)
|
|
|
|
# Check the input layer number.
|
|
assert input_loc_num == input_conf_num
|
|
|
|
inputs = [priorbox.name]
|
|
inputs.extend([l.name for l in input_loc])
|
|
inputs.extend([l.name for l in input_conf])
|
|
parents = [priorbox]
|
|
parents.extend(input_loc)
|
|
parents.extend(input_conf)
|
|
|
|
size = keep_top_k * 7
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.DETECTION_OUTPUT_LAYER,
|
|
inputs=inputs,
|
|
size=size,
|
|
input_num=input_loc_num,
|
|
num_classes=num_classes,
|
|
nms_threshold=nms_threshold,
|
|
nms_top_k=nms_top_k,
|
|
keep_top_k=keep_top_k,
|
|
confidence_threshold=confidence_threshold,
|
|
background_id=background_id)
|
|
return LayerOutput(
|
|
name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
|
|
|
|
|
|
@wrap_name_default("roi_pool")
|
|
def roi_pool_layer(input,
|
|
rois,
|
|
pooled_width,
|
|
pooled_height,
|
|
spatial_scale,
|
|
num_channels=None,
|
|
name=None):
|
|
"""
|
|
A layer used by Fast R-CNN to extract feature maps of ROIs from the last
|
|
feature map.
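
    A minimal usage sketch (assuming :code:`conv` is a convolutional feature map
    and :code:`rois` holds the region proposals; the pooled size and scale are
    only illustrative):

    .. code-block:: python

       roi_feat = roi_pool_layer(input=conv,
                                 rois=rois,
                                 pooled_width=7,
                                 pooled_height=7,
                                 spatial_scale=1.0 / 16)
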
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input layer.
|
|
:type input: LayerOutput.
|
|
:param rois: The input ROIs' data.
|
|
:type rois: LayerOutput.
|
|
:param pooled_width: The width after pooling.
|
|
:type pooled_width: int
|
|
:param pooled_height: The height after pooling.
|
|
:type pooled_height: int
|
|
:param spatial_scale: The spatial scale between the image and feature map.
|
|
:type spatial_scale: float
|
|
:param num_channels: The number of the input channels.
|
|
:type num_channels: int
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if num_channels is None:
|
|
assert input.num_filters is not None
|
|
num_channels = input.num_filters
|
|
size = num_channels * pooled_width * pooled_height
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.ROI_POOL_LAYER,
|
|
inputs=[input.name, rois.name],
|
|
pooled_width=pooled_width,
|
|
pooled_height=pooled_height,
|
|
spatial_scale=spatial_scale,
|
|
num_channels=num_channels)
|
|
return LayerOutput(
|
|
name, LayerType.ROI_POOL_LAYER, parents=[input, rois], size=size)
|
|
|
|
|
|
@wrap_name_default("cross_channel_norm")
|
|
def cross_channel_norm_layer(input, name=None, param_attr=None):
|
|
"""
|
|
    Normalize a layer's output. This layer is necessary for ssd. This
    layer applies normalization across the channels of each sample of
    a convolutional layer's output and scales the output by a group of
    trainable factors whose number equals the number of channels.
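
    A minimal usage sketch (assuming :code:`conv` is the output of a
    convolutional layer; the initialization values are only illustrative):

    .. code-block:: python

       norm = cross_channel_norm_layer(input=conv,
                                       param_attr=ParamAttr(initial_mean=20.0,
                                                            initial_std=0.0))
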
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert input.num_filters is not None
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.NORM_LAYER,
|
|
inputs=[
|
|
Input(
|
|
input.name,
|
|
norm=Norm(
|
|
norm_type="cross-channel-norm",
|
|
channels=input.num_filters,
|
|
size=input.size,
|
|
scale=0,
|
|
pow=0,
|
|
blocked=0),
|
|
**param_attr.attr)
|
|
])
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.NORM_LAYER,
|
|
parents=input,
|
|
num_filters=input.num_filters,
|
|
size=input.size)
|
|
|
|
|
|
@wrap_name_default("seq_pooling")
|
|
@wrap_bias_attr_default(has_bias=False)
|
|
@wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling())
|
|
@layer_support()
|
|
def pooling_layer(input,
|
|
pooling_type=None,
|
|
name=None,
|
|
bias_attr=None,
|
|
agg_level=AggregateLevel.TO_NO_SEQUENCE,
|
|
stride=-1,
|
|
layer_attr=None):
|
|
"""
|
|
Pooling layer for sequence inputs, not used for Image.
|
|
|
|
If stride > 0, this layer slides a window whose size is determined by stride,
|
|
and returns the pooling value of the sequence in the window as the output. Thus,
|
|
a long sequence will be shortened. Note that for sequence with sub-sequence, the
|
|
default value of stride is -1.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
seq_pool = pooling_layer(input=layer,
|
|
pooling_type=AvgPooling(),
|
|
agg_level=AggregateLevel.TO_NO_SEQUENCE)
|
|
|
|
:param agg_level: AggregateLevel.TO_NO_SEQUENCE or
|
|
AggregateLevel.TO_SEQUENCE
|
|
:type agg_level: AggregateLevel
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param pooling_type: Type of pooling. MaxPooling is the default pooling.
|
|
:type pooling_type: BasePoolingType | None
|
|
:param stride: The step size between successive pooling regions.
|
|
:type stride: int
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
extra_dict = dict()
|
|
# noinspection PyUnresolvedReferences
|
|
if isinstance(pooling_type, AvgPooling):
|
|
extra_dict['average_strategy'] = pooling_type.strategy
|
|
elif isinstance(pooling_type, MaxPooling) and \
|
|
pooling_type.output_max_index is not None:
|
|
assert isinstance(pooling_type.output_max_index, bool)
|
|
extra_dict['output_max_index'] = pooling_type.output_max_index
|
|
extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
if agg_level == AggregateLevel.TO_SEQUENCE:
|
|
assert stride == -1
|
|
|
|
Layer(
|
|
name=name,
|
|
type=pooling_type.name,
|
|
inputs=[Input(input.name)],
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
trans_type=agg_level,
|
|
stride=stride,
|
|
**extra_dict)
|
|
|
|
return LayerOutput(
|
|
name, pooling_type.name, parents=[input], size=input.size)
|
|
|
|
|
|
@wrap_bias_attr_default()
|
|
@wrap_param_attr_default()
|
|
@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
|
|
@wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation())
|
|
@wrap_name_default("lstmemory")
|
|
@layer_support()
|
|
def lstmemory(input,
|
|
name=None,
|
|
size=None,
|
|
reverse=False,
|
|
act=None,
|
|
gate_act=None,
|
|
state_act=None,
|
|
bias_attr=None,
|
|
param_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
Long Short-term Memory Cell.
|
|
|
|
    The memory cell is implemented with the following equations.
|
|
|
|
.. math::
|
|
|
|
i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
|
|
|
|
f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
|
|
|
|
c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
|
|
|
|
o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
|
|
|
|
h_t & = o_t tanh(c_t)
|
|
|
|
|
|
NOTE: In PaddlePaddle's implementation, the multiplications
|
|
:math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
|
|
:math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in the lstmemory layer,
|
|
so an additional mixed_layer with full_matrix_projection or a fc_layer must
|
|
be included in the configuration file to complete the input-to-hidden
|
|
mappings before lstmemory is called.
|
|
|
|
NOTE: This is a low level user interface. You can use network.simple_lstm
|
|
to config a simple plain lstm layer.
|
|
|
|
Reference:
|
|
`Generating Sequences With Recurrent Neural Networks
|
|
<https://arxiv.org/pdf/1308.0850.pdf>`_
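
    A minimal usage sketch (assuming :code:`prev` is an existing layer and the
    cell dimension is 512; as noted above, the input-to-hidden projection has
    to be provided separately, here by an fc_layer):

    .. code-block:: python

       proj = fc_layer(input=prev, size=512 * 4,
                       act=LinearActivation(), bias_attr=False)
       lstm = lstmemory(input=proj)
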
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param size: DEPRECATED. The dimension of the lstm cell.
|
|
:type size: int
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param reverse: Whether the input sequence is processed in a reverse order.
|
|
:type reverse: bool
|
|
:param act: Activation type. TanhActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param gate_act: Activation type of this layer's gates. SigmoidActivation is the
|
|
default activation.
|
|
:type gate_act: BaseActivation
|
|
:param state_act: Activation type of the state. TanhActivation is the default activation.
|
|
:type state_act: BaseActivation
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
assert gate_act.support_hppl
|
|
assert state_act.support_hppl
|
|
assert act.support_hppl
|
|
assert input.size is not None and input.size % 4 == 0
|
|
|
|
if size is not None:
|
|
if input.size / 4 == size:
|
|
plog = logger.warning
|
|
else:
|
|
plog = logger.fatal
|
|
plog("size of lstmemory layer: %s is automatically set to "
|
|
"size of input layer / 4. The parameter size passing to "
|
|
"this layer is ignored." % (name))
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.LSTMEMORY,
|
|
active_type=act.name,
|
|
active_state_type=state_act.name,
|
|
active_gate_type=gate_act.name,
|
|
reversed=reverse,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
inputs=[Input(input.name, **param_attr.attr)],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.LSTMEMORY, [input],
|
|
size=input.size / 4,
|
|
reverse=reverse)
|
|
|
|
|
|
@wrap_bias_attr_default()
|
|
@wrap_param_attr_default()
|
|
@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
|
|
@wrap_act_default(param_names=["act"], act=TanhActivation())
|
|
@wrap_name_default("gru")
|
|
@layer_support()
|
|
def grumemory(input,
|
|
size=None,
|
|
name=None,
|
|
reverse=False,
|
|
act=None,
|
|
gate_act=None,
|
|
bias_attr=None,
|
|
param_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
Gate Recurrent Unit Layer.
|
|
|
|
    The memory cell is implemented with the following equations.
|
|
|
|
    1. update gate :math:`z`: defines how much of the previous memory to
       keep around, i.e. how much the unit updates its activation. The
       update gate is computed by:
|
|
|
|
.. math::
|
|
|
|
z_t = \\sigma(W_{z}x_{t} + U_{z}h_{t-1} + b_z)
|
|
|
|
2. reset gate :math:`r`: determines how to combine the new input with the
|
|
previous memory. The reset gate is computed similarly to the update gate:
|
|
|
|
.. math::
|
|
|
|
r_t = \\sigma(W_{r}x_{t} + U_{r}h_{t-1} + b_r)
|
|
|
|
3. The candidate activation :math:`\\tilde{h_t}` is computed similarly to
|
|
that of the traditional recurrent unit:
|
|
|
|
.. math::
|
|
|
|
{\\tilde{h_t}} = tanh(W x_{t} + U (r_{t} \odot h_{t-1}) + b)
|
|
|
|
4. The hidden activation :math:`h_t` of the GRU at time t is a linear
|
|
interpolation between the previous activation :math:`h_{t-1}` and the
|
|
candidate activation :math:`\\tilde{h_t}`:
|
|
|
|
.. math::
|
|
|
|
h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}}
|
|
|
|
NOTE: In PaddlePaddle's implementation, the multiplication operations
|
|
:math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not performed
|
|
in gate_recurrent layer. Consequently, an additional mixed_layer with
|
|
full_matrix_projection or a fc_layer must be included before grumemory
|
|
is called.
|
|
|
|
Reference:
|
|
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling
|
|
<https://arxiv.org/abs/1412.3555>`_
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
gru = grumemory(input)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput.
|
|
:param size: DEPRECATED. The dimension of the gru cell.
|
|
:type size: int
|
|
:param reverse: Whether the input sequence is processed in a reverse order.
|
|
:type reverse: bool
|
|
:param act: Activation type, TanhActivation is the default. This activation
|
|
affects the :math:`{\\tilde{h_t}}`.
|
|
:type act: BaseActivation
|
|
:param gate_act: Activation type of this layer's two gates. SigmoidActivation is
|
|
the default activation. This activation affects the :math:`z_t`
|
|
and :math:`r_t`. It is the :math:`\\sigma` in the above formula.
|
|
:type gate_act: BaseActivation
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert act.support_hppl
|
|
assert gate_act.support_hppl
|
|
assert input.size is not None and input.size % 3 == 0
|
|
if size is not None:
|
|
if input.size / 3 == size:
|
|
plog = logger.warning
|
|
else:
|
|
plog = logger.fatal
|
|
plog("size of grumemory layer: %s is automatically set to "
|
|
"size of input layer / 3. The parameter size passing to this "
|
|
"layer is ignored." % (name))
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.GRUMEMORY,
|
|
active_type=act.name,
|
|
active_gate_type=gate_act.name,
|
|
reversed=reverse,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
inputs=[Input(input.name, **param_attr.attr)],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.GRUMEMORY, [input],
|
|
size=input.size / 3,
|
|
reverse=reverse)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def last_seq(input,
|
|
name=None,
|
|
agg_level=AggregateLevel.TO_NO_SEQUENCE,
|
|
stride=-1,
|
|
layer_attr=None):
|
|
"""
|
|
    Get the last timestep activation of a sequence.
|
|
|
|
If stride > 0, this layer will slide a window whose size is determined by stride,
|
|
and return the last value of the sequence in the window as the output. Thus, a
|
|
long sequence will be shortened. Note that for sequence with sub-sequence, the
|
|
default value of stride is -1.
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
seq = last_seq(input=layer)
|
|
|
|
:param agg_level: Aggregated level
|
|
:type agg_level: AggregateLevel
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param stride: The step size between successive pooling regions.
|
|
:type stride: int
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if input.reverse is not None and input.reverse:
|
|
logger.warning("You are getting the last instance of a sequence that"
|
|
" is a output of a REVERSED layer. There is no time"
|
|
" series information at all. Maybe you want to use"
|
|
" first_seq instead.")
|
|
|
|
if agg_level == AggregateLevel.TO_SEQUENCE:
|
|
assert stride == -1
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.SEQUENCE_LAST_INSTANCE,
|
|
inputs=[input.name],
|
|
trans_type=agg_level,
|
|
stride=stride,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.SEQUENCE_LAST_INSTANCE,
|
|
parents=[input],
|
|
size=input.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def first_seq(input,
|
|
name=None,
|
|
agg_level=AggregateLevel.TO_NO_SEQUENCE,
|
|
stride=-1,
|
|
layer_attr=None):
|
|
"""
|
|
    Get the first timestep activation of a sequence.
|
|
|
|
If stride > 0, this layer will slide a window whose size is determined by stride,
|
|
and return the first value of the sequence in the window as the output. Thus, a
|
|
long sequence will be shortened. Note that for sequence with sub-sequence, the
|
|
default value of stride is -1.
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
seq = first_seq(input=layer)
|
|
|
|
:param agg_level: aggregation level
|
|
:type agg_level: AggregateLevel
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param stride: The step size between successive pooling regions.
|
|
:type stride: int
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
if input.reverse is not None and not input.reverse:
|
|
logger.warning('You are getting the first instance for a time series,'
|
|
' and it is a normal recurrent layer output. There is no'
|
|
' time series information at all. Maybe you want to use'
|
|
' last_seq instead.')
|
|
|
|
if agg_level == AggregateLevel.TO_SEQUENCE:
|
|
assert stride == -1
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.SEQUENCE_FIRST_INSTANCE,
|
|
inputs=[input.name],
|
|
trans_type=agg_level,
|
|
stride=stride,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.SEQUENCE_FIRST_INSTANCE,
|
|
parents=[input],
|
|
size=input.size)
|
|
|
|
|
|
class ExpandLevel(object):
|
|
"""
|
|
Please refer to AggregateLevel first.
|
|
|
|
ExpandLevel supports two modes:
|
|
|
|
- :code:`ExpandLevel.FROM_NO_SEQUENCE` means the expansion acts on
|
|
:code:`NO_SEQUENCE`, which will be expanded to
|
|
:code:`SEQUENCE` or :code:`SUB_SEQUENCE`.
|
|
|
|
- :code:`ExpandLevel.FROM_SEQUENCE` means the expansion acts on
|
|
:code:`SEQUENCE`, which will be expanded to
|
|
:code:`SUB_SEQUENCE`.
|
|
"""
|
|
FROM_NO_SEQUENCE = AggregateLevel.TO_NO_SEQUENCE
|
|
FROM_SEQUENCE = AggregateLevel.TO_SEQUENCE
|
|
# compatible with previous configuration
|
|
FROM_TIMESTEP = FROM_NO_SEQUENCE
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def expand_layer(input,
|
|
expand_as,
|
|
name=None,
|
|
bias_attr=False,
|
|
expand_level=ExpandLevel.FROM_NO_SEQUENCE,
|
|
layer_attr=None):
|
|
"""
|
|
A layer for expanding dense data or sequence data (where the length of each
sequence is one) to sequence data.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
expand = expand_layer(input=layer1,
|
|
expand_as=layer2,
|
|
expand_level=ExpandLevel.FROM_NO_SEQUENCE)
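
A slightly fuller sketch is shown below (the layer names and sizes are
illustrative assumptions, not part of a real configuration): a per-sample
vector is copied to every time step of the sequence given by expand_as.

.. code-block:: python

    sentence = data_layer(name='sentence', size=128)    # a sequence input
    summary = fc_layer(input=last_seq(input=sentence), size=64)
    # 'summary' holds one vector per sample; expand it back to the length
    # of 'sentence' so it can be combined with every time step.
    expanded = expand_layer(input=summary,
                            expand_as=sentence,
                            expand_level=ExpandLevel.FROM_NO_SEQUENCE)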
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param expand_as: Expand the input according to this layer's sequence information.
After the operation, the expanded input will have the same number of
elements as this layer.
|
|
:type expand_as: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param expand_level: Whether the input layer is a sequence or the element of a sequence.
|
|
:type expand_level: ExpandLevel
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
Layer(
|
|
inputs=[input.name, expand_as.name],
|
|
name=name,
|
|
bias=ParamAttr.to_bias(bias_attr=bias_attr),
|
|
type=LayerType.EXPAND_LAYER,
|
|
trans_type=expand_level,
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
size=input.size,
|
|
layer_type=LayerType.EXPAND_LAYER,
|
|
parents=[input, expand_as])
|
|
|
|
|
|
@wrap_name_default()
|
|
@wrap_act_default(act=IdentityActivation())
|
|
@layer_support()
|
|
def repeat_layer(input,
|
|
num_repeats,
|
|
as_row_vector=True,
|
|
act=None,
|
|
name=None,
|
|
layer_attr=None):
|
|
"""
|
|
A layer for repeating the input for num_repeats times.
|
|
|
|
If as_row_vector:
|
|
|
|
.. math::
|
|
y = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
|
|
|
|
If not as_row_vector:
|
|
|
|
.. math::
|
|
y = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
|
|
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
expand = repeat_layer(input=layer, num_repeats=4)
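
As a concrete illustration (the numbers are made up, not from a real run):
for one input row x = [1, 2, 3] and num_repeats=2, as_row_vector=True gives
[1, 2, 3, 1, 2, 3], while as_row_vector=False gives [1, 1, 2, 2, 3, 3].

.. code-block:: python

    # illustrative sketch only
    col_repeat = repeat_layer(input=layer, num_repeats=2, as_row_vector=False)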
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param num_repeats: The times of repeating the input.
|
|
:type num_repeats: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param as_row_vector: Whether to treat the input as row vectors or not. If
|
|
the parameter is set to True, the repeating operation
|
|
will be performed in the column direction. Otherwise,
|
|
it will be performed in the row direction.
|
|
:type as_row_vector: bool
|
|
:param act: Activation type. IdentityActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
l = Layer(
|
|
inputs=[input.name],
|
|
name=name,
|
|
active_type=act.name,
|
|
num_filters=num_repeats,
|
|
as_row_vector=as_row_vector,
|
|
type=LayerType.FEATURE_MAP_EXPAND_LAYER,
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
size=l.config.size,
|
|
layer_type=LayerType.FEATURE_MAP_EXPAND_LAYER,
|
|
activation=act,
|
|
parents=[input])
|
|
|
|
|
|
@wrap_name_default("seqreshape")
|
|
@wrap_act_default(act=IdentityActivation())
|
|
@wrap_bias_attr_default(has_bias=False)
|
|
@layer_support(ERROR_CLIPPING, DROPOUT)
|
|
def seq_reshape_layer(input,
|
|
reshape_size,
|
|
act=None,
|
|
name=None,
|
|
layer_attr=None,
|
|
bias_attr=None):
|
|
"""
|
|
A layer for reshaping the sequence. Assume the input sequence has T instances,
|
|
the dimension of each instance is M, and the input reshape_size is N, then the
|
|
output sequence has T*M/N instances, the dimension of each instance is N.
|
|
|
|
Note that T*M/N must be an integer.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
reshape = seq_reshape_layer(input=layer, reshape_size=4)
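
For example (an illustrative calculation): a sequence of T=6 instances, each of
dimension M=2, reshaped with reshape_size=4 becomes a sequence of
T*M/N = 6*2/4 = 3 instances, each of dimension 4.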
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param reshape_size: The dimension of the reshaped sequence.
|
|
:type reshape_size: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param act: Activation type. IdentityActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
Layer(
|
|
inputs=[input.name],
|
|
name=name,
|
|
size=reshape_size,
|
|
type=LayerType.SEQUENCE_RESHAPE,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
size=reshape_size,
|
|
layer_type=LayerType.SEQUENCE_RESHAPE,
|
|
parents=[input])
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def interpolation_layer(input, weight, name=None, layer_attr=None):
|
|
"""
|
|
This layer performs linear interpolation on two inputs,
|
|
which is used in NEURAL TURING MACHINE.
|
|
|
|
.. math::
|
|
y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i]
|
|
|
|
where :math:`x_1` and :math:`x_2` are two (batchSize x dataDim) inputs,
|
|
:math:`w` is (batchSize x 1) weight vector, and :math:`y` is
|
|
(batchSize x dataDim) output.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
interpolation = interpolation_layer(input=[layer1, layer2], weight=layer3)
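
For a single row (an illustrative example): if w[i] = 0.3, the output is
y.row[i] = 0.3 * x_1.row[i] + 0.7 * x_2.row[i].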
|
|
|
|
:param input: The input of this layer.
|
|
:type input: list | tuple
|
|
:param weight: Weight layer.
|
|
:type weight: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, collections.Sequence)
|
|
assert len(input) == 2
|
|
assert isinstance(input[0], LayerOutput) and isinstance(input[1],
|
|
LayerOutput)
|
|
if input[0].size is not None and input[1].size is not None:
|
|
assert input[0].size == input[1].size
|
|
assert isinstance(weight, LayerOutput)
|
|
if weight.size is not None:
|
|
assert weight.size == 1
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.INTERPOLATION_LAYER,
|
|
inputs=[weight.name, input[0].name, input[1].name],
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.INTERPOLATION_LAYER,
|
|
parents=[weight, input[0], input[1]],
|
|
size=input[0].size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def bilinear_interp_layer(input,
|
|
out_size_x=None,
|
|
out_size_y=None,
|
|
name=None,
|
|
layer_attr=None):
|
|
"""
|
|
This layer implements bilinear interpolation on the output of a convolutional layer.
|
|
|
|
Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput.
|
|
:param out_size_x: The width of the output.
|
|
:type out_size_x: int
|
|
:param out_size_y: The height of the output.
|
|
:type out_size_y: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert input.layer_type == LayerType.CONV_LAYER
|
|
assert isinstance(input.activation, LinearActivation)
|
|
assert out_size_x > 0 and out_size_y > 0
|
|
assert input.num_filters is not None
|
|
num_channels = input.num_filters
|
|
l = Layer(
|
|
name=name,
|
|
inputs=Input(
|
|
input.name,
|
|
bilinear_interp=BilinearInterp(
|
|
out_size_x=out_size_x,
|
|
out_size_y=out_size_y,
|
|
channels=num_channels)),
|
|
type=LayerType.BILINEAR_INTERP_LAYER,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.BILINEAR_INTERP_LAYER,
|
|
parents=[input],
|
|
num_filters=num_channels,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def power_layer(input, weight, name=None, layer_attr=None):
|
|
"""
|
|
This layer applies a power function to a vector element-wise,
|
|
which is used in NEURAL TURING MACHINE.
|
|
|
|
.. math::
|
|
y = x^w
|
|
|
|
where :math:`x` is an input vector, :math:`w` is a scalar exponent,
|
|
and :math:`y` is an output vector.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
power = power_layer(input=layer1, weight=layer2)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param weight: The exponent of the power.
|
|
:type weight: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, LayerOutput) and isinstance(weight, LayerOutput)
|
|
if weight.size is not None:
|
|
assert weight.size == 1
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.POWER_LAYER,
|
|
inputs=[weight.name, input.name],
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.POWER_LAYER, parents=[input, weight], size=input.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def scaling_layer(input, weight, name=None, layer_attr=None):
|
|
"""
|
|
A layer for multiplying the input vector by a weight scalar.
|
|
|
|
.. math::
|
|
y = w x
|
|
|
|
where :math:`x` is size=dataDim input, :math:`w` is size=1 weight,
|
|
and :math:`y` is size=dataDim output.
|
|
|
|
Note that the above computation is for one sample. Multiple samples are
|
|
processed in one batch.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
scale = scaling_layer(input=layer1, weight=layer2)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param weight: The weight of each sample.
|
|
:type weight: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(weight, LayerOutput) and isinstance(input, LayerOutput)
|
|
if weight.size is not None:
|
|
assert weight.size == 1
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.SCALING_LAYER,
|
|
inputs=[weight.name, input.name],
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.SCALING_LAYER, parents=[weight, input], size=input.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def trans_layer(input, name=None, layer_attr=None):
|
|
"""
|
|
A layer for transposing a minibatch matrix.
|
|
|
|
.. math::
|
|
y = x^\mathrm{T}
|
|
|
|
where :math:`x` is (M x N) input, and :math:`y` is (N x M) output.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
trans = trans_layer(input=layer)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.TRANS_LAYER,
|
|
inputs=[input.name],
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.TRANS_LAYER, parents=[input], size=input.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def rotate_layer(input, height, width, name=None, layer_attr=None):
|
|
"""
|
|
A layer for rotating each feature channel by 90 degrees (clockwise),
usually used when the input sample is an image or a feature map.
|
|
|
|
.. math::
|
|
y(j,i,:) = x(M-i-1,j,:)
|
|
|
|
where :math:`x` is (M x N x C) input, and :math:`y` is (N x M x C) output.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
rot = rotate_layer(input=layer,
|
|
height=100,
|
|
width=100)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param height: The height of the sample matrix.
|
|
:type height: int
|
|
:param width: The width of the sample matrix.
|
|
:type width: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
l = Layer(
|
|
name=name,
|
|
height=height,
|
|
width=width,
|
|
type=LayerType.ROTATE_LAYER,
|
|
inputs=[input.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.ROTATE_LAYER,
|
|
parents=[input],
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
|
|
"""
|
|
Cosine Similarity Layer. The cosine similarity equation is here.
|
|
|
|
.. math::
|
|
similarity = cos(\\theta) = {\\mathbf{a} \\cdot \\mathbf{b}
|
|
\\over \\|\\mathbf{a}\\| \\|\\mathbf{b}\\|}
|
|
|
|
The size of a is M and the size of b is M*N. The similarity is calculated N
times, once between a and each consecutive M-element block of b. The output
size is N, and each similarity value is multiplied by the scale.
|
|
|
|
Note that the above computation is for one sample. Multiple samples are
|
|
processed in one batch.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cos = cos_sim(a=layer1, b=layer2, size=3)
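
For instance (sizes chosen purely for illustration): if a has size 4 and b has
size 12, then size=3 yields three cosine similarities, one between a and each
consecutive 4-element block of b, so the output of this layer has size 3.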
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param a: The first input of this layer.
|
|
:type a: LayerOutput
|
|
:param b: The second input of this layer.
|
|
:type b: LayerOutput
|
|
:param scale: The scale of the cosine similarity. 1 is the default value.
|
|
:type scale: float
|
|
:param size: The dimension of this layer. NOTE: a.size * size should equal b.size.
|
|
:type size: int
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
|
|
if size == 1:
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.COSINE_SIM,
|
|
cos_scale=scale,
|
|
inputs=[a.name, b.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
else:
|
|
if a.size is not None and b.size is not None:
|
|
assert size == b.size / a.size
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.COSINE_SIM_VEC,
|
|
size=size,
|
|
cos_scale=scale,
|
|
inputs=[a.name, b.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b], size=size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def l2_distance_layer(x, y, name=None, layer_attr=None):
|
|
"""
|
|
This layer calculates and returns the Euclidean distance between two input
|
|
vectors x and y. The equation is as follows:
|
|
|
|
.. math::
|
|
l2_distance(\\mathbf{x}, \\mathbf{y}) = \\sqrt{\\sum_{i=1}^D(x_i - y_i)^2}
|
|
|
|
The output size of this layer is fixed to be 1. Note that the above
|
|
computation is for one sample. Multiple samples are processed in one batch.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
l2_sim = l2_distance(x=layer1, y=layer2)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param x: The first input x for this layer, whose output is a matrix with
|
|
dimensionality N x D. N is the sample number in a mini-batch.
|
|
D is the dimensionality of x's output.
|
|
:type x: LayerOutput
|
|
:param y: The second input y for this layer, whose output is a matrix with
|
|
dimensionality N x D. N is the sample number in a mini-batch.
|
|
D is the dimensionality of y's output.
|
|
:type y: LayerOutput
|
|
:param layer_attr: The extra layer attributes, for example, drop rate.
|
|
See ExtraLayerAttribute for more details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: The returned LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
assert isinstance(x, LayerOutput) and isinstance(y, LayerOutput)
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.L2_DISTANCE,
|
|
inputs=[x.name, y.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(name, LayerType.L2_DISTANCE, parents=[x, y], size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
@wrap_bias_attr_default(has_bias=True)
|
|
@wrap_param_attr_default()
|
|
@layer_support()
|
|
def hsigmoid(input,
|
|
label,
|
|
num_classes=None,
|
|
name=None,
|
|
bias_attr=None,
|
|
param_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
Organize the classes into a binary tree. At each node, a sigmoid function
|
|
is used to calculate the probability of belonging to the right branch.
|
|
|
|
Reference:
|
|
`Hierarchical Probabilistic Neural Network Language Model
|
|
<http://www.gatsby.ucl.ac.uk/aistats/fullpapers/208.pdf>`_
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = hsigmoid(input=[layer1, layer2],
|
|
label=data_layer)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput | list | tuple
|
|
:param label: The input label.
|
|
:type label: LayerOutput
|
|
:param num_classes: The number of classes. It should be larger than 2. If the parameter
|
|
is not set or set to None, its actual value will be automatically set to
|
|
the number of labels.
|
|
:type num_classes: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if isinstance(input, LayerOutput):
|
|
input = [input]
|
|
if not isinstance(param_attr, collections.Sequence):
|
|
param_attr = [param_attr]
|
|
else:
|
|
if not isinstance(param_attr, collections.Sequence):
|
|
param_attr = [param_attr] * len(input)
|
|
else:
|
|
assert len(param_attr) == len(input)
|
|
|
|
assert isinstance(input, collections.Sequence)
|
|
assert isinstance(label, LayerOutput)
|
|
assert label.layer_type == LayerType.DATA
|
|
|
|
if num_classes is None:
|
|
num_classes = label.size
|
|
if num_classes is None or num_classes <= 2:
|
|
raise ValueError("hsigmoid label size must larger than 2.")
|
|
|
|
ipts_for_layer = []
|
|
parents = []
|
|
for each_input, each_param_attr in zip(input, param_attr):
|
|
assert isinstance(each_input, LayerOutput)
|
|
ipts_for_layer.append(Input(each_input.name, **each_param_attr.attr))
|
|
parents.append(each_input)
|
|
ipts_for_layer.append(label.name)
|
|
parents.append(label)
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.HSIGMOID,
|
|
num_classes=num_classes,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
inputs=ipts_for_layer,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.HSIGMOID, parents=parents, size=l.config.size)
|
|
|
|
|
|
@wrap_name_default("conv")
|
|
@wrap_param_attr_default()
|
|
@wrap_bias_attr_default()
|
|
@wrap_act_default(act=ReluActivation())
|
|
@layer_support(DROPOUT)
|
|
def img_conv_layer(input,
|
|
filter_size,
|
|
num_filters,
|
|
name=None,
|
|
num_channels=None,
|
|
act=None,
|
|
groups=1,
|
|
stride=1,
|
|
padding=0,
|
|
dilation=1,
|
|
bias_attr=None,
|
|
param_attr=None,
|
|
shared_biases=True,
|
|
layer_attr=None,
|
|
filter_size_y=None,
|
|
stride_y=None,
|
|
padding_y=None,
|
|
dilation_y=None,
|
|
trans=False,
|
|
layer_type=None):
|
|
"""
|
|
Convolution layer for image. Paddle can support both square and non-square
|
|
input currently.
|
|
|
|
For details of the convolution layer, please refer to UFLDL's `convolution
<http://ufldl.stanford.edu/tutorial/supervised/
FeatureExtractionUsingConvolution/>`_ .
|
|
|
|
Convolution Transpose (deconv) layer for image. Paddle can support both square
|
|
and non-square input currently.
|
|
|
|
The details of convolution transpose layer,
please refer to the following explanation and references therein:
`what are deconvolutional layers
<http://datascience.stackexchange.com/questions/6107/
what-are-deconvolutional-layers/>`_ .
|
|
The num_channels parameter means the channel number of the input image. It may be 1 or 3 when
the input is raw image pixels (mono or RGB), or it may be the previous layer's
num_filters.
|
|
|
|
There are several groups of filters in PaddlePaddle implementation.
|
|
If the groups attribute is greater than 1, for example groups=2,
|
|
the input will be split into 2 parts along the channel axis, and
the filters will also be split into 2 parts. The first half of the filters
|
|
is only connected to the first half of the input channels, while the second
|
|
half of the filters is only connected to the second half of the input. After
|
|
the computation of convolution for each part of input,
|
|
the output will be obtained by concatenating the two results.
|
|
|
|
For the details of grouped convolution, please refer to:
|
|
`ImageNet Classification with Deep Convolutional Neural Networks
|
|
<http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
|
|
num_channels=8,
|
|
num_filters=16, stride=1,
|
|
bias_attr=False,
|
|
act=ReluActivation())
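
A grouped-convolution variant might look like the sketch below (the sizes are
illustrative assumptions, not a recommended configuration); with groups=2, the
8 input channels and the 16 filters are split into two halves as described
above.

.. code-block:: python

    group_conv = img_conv_layer(input=data, filter_size=3,
                                num_channels=8, num_filters=16,
                                groups=2, stride=1, padding=1,
                                act=ReluActivation())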
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param filter_size: The dimensions of the filter kernel. If the parameter is
|
|
set to one integer, the two dimensions on the x and y axes
will be the same when filter_size_y is not set. If it is set
|
|
to a list, the first element indicates the dimension on
|
|
the x axis, and the second is used to specify the dimension
|
|
on the y axis when filter_size_y is not provided.
|
|
:type filter_size: int | tuple | list
|
|
:param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
|
|
is not set, it will be set automatically according to filter_size.
|
|
:type filter_size_y: int
|
|
:param num_filters: The number of filters. It is as same as the output image channel.
|
|
:type num_filters: int
|
|
:param act: Activation type. ReluActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param groups: The group number. 1 is the default group number.
|
|
:type groups: int
|
|
:param stride: The strides. If the parameter is set to one integer, the strides
|
|
on the x and y axes will be the same when stride_y is not set. If it is
|
|
set to a list, the first element indicates the stride on the x axis,
|
|
and the second is used to specify the stride on the y axis when
|
|
stride_y is not provided. 1 is the default value.
|
|
:type stride: int | tuple | list
|
|
:param stride_y: The stride on the y axis.
|
|
:type stride_y: int
|
|
:param padding: The padding sizes. If the parameter is set to one integer, the padding
|
|
sizes on the x and y axes will be the same when padding_y is not set. If it
|
|
is set to a list, the first element indicates the padding size on the
|
|
x axis, and the second is used to specify the padding size on the y axis
|
|
when padding_y is not provided. 0 is the default padding size.
|
|
:type padding: int | tuple | list
|
|
:param padding_y: The padding size on the y axis.
|
|
:type padding_y: int
|
|
:param dilation: The dimensions of the dilation. If the parameter is set to one integer,
|
|
the two dimensions on the x and y axes will be the same when dilation_y is not
|
|
set. If it is set to a list, the first element indicates the dimension
|
|
on the x axis, and the second is used to specify the dimension on the y
|
|
axis when dilation_y is not provided. 1 is the default dimension.
|
|
:type dilation: int | tuple | list
|
|
:param dilation_y: The dimension of the dilation on the y axis.
|
|
:type dilation_y: int
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param num_channels: The number of input channels. If the parameter is not set or
|
|
set to None, its actual value will be automatically set to
|
|
the channel number of the input.
|
|
:type num_channels: int
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for
|
|
details.
|
|
:type param_attr: ParameterAttribute
|
|
:param shared_biases: Whether biases will be shared between filters or not.
|
|
:type shared_biases: bool
|
|
:param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:param trans: True if it is a convTransLayer, False if it is a convLayer
|
|
:type trans: bool
|
|
:param layer_type: Specify the layer type. If the dilation's dimension on one axis is
|
|
larger than 1, layer_type has to be "cudnn_conv" or "cudnn_convt".
|
|
If trans=True, layer_type has to be "exconvt" or "cudnn_convt",
|
|
otherwise layer_type has to be either "exconv" or "cudnn_conv".
|
|
:type layer_type: basestring
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if num_channels is None:
|
|
assert input.num_filters is not None
|
|
num_channels = input.num_filters
|
|
|
|
if filter_size_y is None:
|
|
if isinstance(filter_size, collections.Sequence):
|
|
assert len(filter_size) == 2
|
|
filter_size, filter_size_y = filter_size
|
|
else:
|
|
filter_size_y = filter_size
|
|
|
|
if stride_y is None:
|
|
if isinstance(stride, collections.Sequence):
|
|
assert len(stride) == 2
|
|
stride, stride_y = stride
|
|
else:
|
|
stride_y = stride
|
|
|
|
if padding_y is None:
|
|
if isinstance(padding, collections.Sequence):
|
|
assert len(padding) == 2
|
|
padding, padding_y = padding
|
|
else:
|
|
padding_y = padding
|
|
|
|
if dilation_y is None:
|
|
if isinstance(dilation, collections.Sequence):
|
|
assert len(dilation) == 2
|
|
dilation, dilation_y = dilation
|
|
else:
|
|
dilation_y = dilation
|
|
|
|
if param_attr.attr.get('initial_smart'):
|
|
# special initial for conv layers.
|
|
init_w = (2.0 / (filter_size**2 * num_channels))**0.5
|
|
param_attr.attr["initial_mean"] = 0.0
|
|
param_attr.attr["initial_std"] = init_w
|
|
param_attr.attr["initial_strategy"] = 0
|
|
param_attr.attr["initial_smart"] = False
|
|
|
|
if layer_type:
|
|
if dilation > 1 or dilation_y > 1:
|
|
assert layer_type in [
|
|
"cudnn_conv", "cudnn_convt", "exconv", "exconvt"
|
|
]
|
|
if trans:
|
|
assert layer_type in ["exconvt", "cudnn_convt"]
|
|
else:
|
|
assert layer_type in ["exconv", "cudnn_conv"]
|
|
lt = layer_type
|
|
else:
|
|
lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER
|
|
|
|
l = Layer(
|
|
name=name,
|
|
inputs=Input(
|
|
input.name,
|
|
conv=Conv(
|
|
filter_size=filter_size,
|
|
padding=padding,
|
|
dilation=dilation,
|
|
stride=stride,
|
|
channels=num_channels,
|
|
groups=groups,
|
|
filter_size_y=filter_size_y,
|
|
padding_y=padding_y,
|
|
dilation_y=dilation_y,
|
|
stride_y=stride_y),
|
|
**param_attr.attr),
|
|
active_type=act.name,
|
|
num_filters=num_filters,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
shared_biases=shared_biases,
|
|
type=lt,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
lt,
|
|
parents=[input],
|
|
activation=act,
|
|
num_filters=num_filters,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default("pool")
|
|
@layer_support()
|
|
def img_pool_layer(input,
|
|
pool_size,
|
|
name=None,
|
|
num_channels=None,
|
|
pool_type=None,
|
|
stride=1,
|
|
padding=0,
|
|
layer_attr=None,
|
|
pool_size_y=None,
|
|
stride_y=None,
|
|
padding_y=None,
|
|
ceil_mode=True,
|
|
exclude_mode=None):
|
|
"""
|
|
Image pooling Layer.
|
|
|
|
The details of pooling layer, please refer to ufldl's pooling_ .
|
|
|
|
.. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
|
|
|
|
- ceil_mode=True:
|
|
|
|
.. math::
|
|
|
|
w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride}
|
|
|
|
h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
|
|
|
|
- ceil_mode=False:
|
|
|
|
.. math::
|
|
|
|
w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride}
|
|
|
|
h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
maxpool = img_pool_layer(input=conv,
|
|
pool_size=3,
|
|
pool_size_y=5,
|
|
num_channels=8,
|
|
stride=1,
|
|
stride_y=2,
|
|
padding=1,
|
|
padding_y=2,
|
|
pool_type=MaxPooling())
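
A worked size calculation (numbers chosen only for illustration): with
input_width=7, pool_size=3, padding=1, stride=2 and ceil_mode=True, the output
width is w = 1 + ceil((7 + 2 * 1 - 3) / 2) = 1 + 3 = 4.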
|
|
|
|
:param padding: The padding size on the x axis. 0 is the default padding size.
|
|
:type padding: int
|
|
:param padding_y: The padding size on the y axis. If the parameter is not set
|
|
or set to None, it will be set to 'padding' automatically.
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param pool_size: The pooling window length on the x axis.
|
|
:type pool_size: int
|
|
:param pool_size_y: The pooling window length on the y axis. If the parameter is
|
|
not set or set to None, its actual value will be automatically
|
|
set to pool_size.
|
|
:type pool_size_y: int
|
|
:param num_channels: The number of input channels. If the parameter is not set or
|
|
set to None, its actual value will be automatically set to
|
|
the channel number of the input.
|
|
:type num_channels: int
|
|
:param pool_type: Pooling type. MaxPooling is the default pooling.
|
|
:type pool_type: BasePoolingType
|
|
:param stride: The stride on the x axis. 1 is the default value.
|
|
:type stride: int
|
|
:param stride_y: The stride on the y axis. If the parameter is not set or set to
|
|
None, its actual value will be automatically set to 'stride'.
|
|
:type stride_y: int
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:param ceil_mode: Whether to use the ceil function to calculate output height and width.
|
|
True is the default. If it is set to False, the floor function will
|
|
be used.
|
|
:type ceil_mode: bool
|
|
:param exclude_mode: Whether to exclude the padding cells from the calculation. This
only takes effect when pool_type is AvgPooling. If it is None, the
padding cells are excluded as well. When using cuDNN, choose CudnnAvgPooling
or CudnnAvgInclPadPooling as pool_type to select the mode instead.
|
|
:type exclude_mode: bool
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if num_channels is None:
|
|
assert input.num_filters is not None
|
|
num_channels = input.num_filters
|
|
|
|
if pool_type is None:
|
|
pool_type = MaxPooling()
|
|
elif isinstance(pool_type, AvgPooling):
|
|
pool_type.name = 'avg'
|
|
|
|
assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling,
|
|
CudnnMaxPooling, CudnnAvgInclPadPooling], \
|
|
"only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported"
|
|
|
|
type_name = pool_type.name + '-projection' \
|
|
if (
|
|
isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
|
|
else pool_type.name
|
|
pool_size_y = pool_size if pool_size_y is None else pool_size_y
|
|
stride_y = stride if stride_y is None else stride_y
|
|
padding_y = padding if padding_y is None else padding_y
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.POOL_LAYER,
|
|
inputs=[
|
|
Input(
|
|
input.name,
|
|
pool=Pool(
|
|
pool_type=type_name,
|
|
channels=num_channels,
|
|
size_x=pool_size,
|
|
start=None,
|
|
stride=stride,
|
|
padding=padding,
|
|
size_y=pool_size_y,
|
|
stride_y=stride_y,
|
|
padding_y=padding_y))
|
|
],
|
|
ceil_mode=ceil_mode,
|
|
exclude_mode=exclude_mode,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.POOL_LAYER,
|
|
parents=[input],
|
|
num_filters=num_channels,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default("pool3d")
|
|
@layer_support()
|
|
def img_pool3d_layer(input,
|
|
pool_size,
|
|
name=None,
|
|
num_channels=None,
|
|
pool_type=None,
|
|
stride=1,
|
|
padding=0,
|
|
layer_attr=None,
|
|
pool_size_y=None,
|
|
stride_y=None,
|
|
padding_y=None,
|
|
pool_size_z=None,
|
|
stride_z=None,
|
|
padding_z=None,
|
|
ceil_mode=True):
|
|
"""
|
|
Image pooling Layer.
|
|
|
|
The details of pooling layer, please refer to ufldl's pooling_ .
|
|
|
|
.. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
|
|
|
|
- ceil_mode=True:
|
|
|
|
.. math::
|
|
|
|
w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride}
|
|
|
|
h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
|
|
|
|
d & = 1 + \\frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z}
|
|
|
|
- ceil_mode=False:
|
|
|
|
.. math::
|
|
|
|
w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride}
|
|
|
|
h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
|
|
|
|
d & = 1 + \\frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z}
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
maxpool = img_pool3d_layer(input=conv,
|
|
pool_size=3,
|
|
num_channels=8,
|
|
stride=1,
|
|
padding=1,
|
|
pool_type=MaxPooling())
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring.
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param pool_size: The pooling window lengths along the three axes. If the parameter
is set to one integer, the three lengths will be the same.
|
|
:type pool_size: int | tuple | list
|
|
:param num_channels: The number of input channels. If the parameter is not set or
|
|
set to None, its actual value will be automatically set to
|
|
the channel number of the input.
|
|
:type num_channels: int
|
|
:param pool_type: Pooling type. MaxPooling is the default pooling.
|
|
:type pool_type: BasePoolingType
|
|
:param stride: The strides of the pooling along the three axes. If the parameter
is set to one integer, the three strides will be the same. 1 is the
|
|
default value.
|
|
:type stride: int | tuple | list
|
|
:param padding: The sizes of padding along the three axes. If the parameter is set to
one integer, they will be the same. 0 is the default padding size.
|
|
:type padding: int | tuple | list
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:param ceil_mode: Whether to use the ceil function to calculate output height and width.
|
|
True is the default. If it is set to False, the floor function will
|
|
be used.
|
|
:type ceil_mode: bool
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if num_channels is None:
|
|
assert input.num_filters is not None
|
|
num_channels = input.num_filters
|
|
|
|
if pool_type is None:
|
|
pool_type = MaxPooling()
|
|
elif isinstance(pool_type, AvgPooling):
|
|
pool_type.name = 'avg'
|
|
|
|
type_name = pool_type.name + '-projection' \
|
|
if (
|
|
isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
|
|
else pool_type.name
|
|
|
|
if isinstance(pool_size, collections.Sequence):
|
|
assert len(pool_size) == 3
|
|
pool_size, pool_size_y, pool_size_z = pool_size
|
|
else:
|
|
pool_size_y = pool_size
|
|
pool_size_z = pool_size
|
|
|
|
if isinstance(stride, collections.Sequence):
|
|
assert len(stride) == 3
|
|
stride, stride_y, stride_z = stride
|
|
else:
|
|
stride_y = stride
|
|
stride_z = stride
|
|
|
|
if isinstance(padding, collections.Sequence):
|
|
assert len(padding) == 3
|
|
padding, padding_y, padding_z = padding
|
|
else:
|
|
padding_y = padding
|
|
padding_z = padding
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.POOL3D_LAYER,
|
|
inputs=[
|
|
Input(
|
|
input.name,
|
|
pool=Pool3d(
|
|
pool_type=type_name,
|
|
channels=num_channels,
|
|
size_x=pool_size,
|
|
start=None,
|
|
stride=stride,
|
|
padding=padding,
|
|
size_y=pool_size_y,
|
|
stride_y=stride_y,
|
|
padding_y=padding_y,
|
|
size_z=pool_size_z,
|
|
stride_z=stride_z,
|
|
padding_z=padding_z))
|
|
],
|
|
ceil_mode=ceil_mode,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.POOL_LAYER,
|
|
parents=[input],
|
|
num_filters=num_channels,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default("spp")
|
|
@layer_support()
|
|
def spp_layer(input,
|
|
name=None,
|
|
num_channels=None,
|
|
pool_type=None,
|
|
pyramid_height=None,
|
|
layer_attr=None):
|
|
"""
|
|
A layer that performs spatial pyramid pooling.
|
|
|
|
Reference:
|
|
`Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
|
|
<https://arxiv.org/abs/1406.4729>`_
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
spp = spp_layer(input=data,
|
|
pyramid_height=2,
|
|
num_channels=16,
|
|
pool_type=MaxPooling())
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param num_channels: The number of input channels. If the parameter is not set or
|
|
set to None, its actual value will be automatically set to
|
|
the channel number of the input.
|
|
:type num_channels: int
|
|
:param pool_type: Pooling type. MaxPooling is the default pooling.
|
|
:type pool_type: BasePoolingType
|
|
:param pyramid_height: The pyramid height of this pooling.
|
|
:type pyramid_height: int
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if num_channels is None:
|
|
assert input.num_filters is not None
|
|
num_channels = input.num_filters
|
|
|
|
if pool_type is None:
|
|
pool_type = MaxPooling()
|
|
elif isinstance(pool_type, AvgPooling):
|
|
pool_type.name = 'avg'
|
|
|
|
type_name = pool_type.name
|
|
if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)):
|
|
type_name += '-projection'
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.SPP_LAYER,
|
|
inputs=Input(
|
|
input.name,
|
|
spp=SpatialPyramidPool(
|
|
pool_type=type_name,
|
|
channels=num_channels,
|
|
pyramid_height=pyramid_height)),
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
layer_type=LayerType.SPP_LAYER,
|
|
parents=[input],
|
|
num_filters=num_channels,
|
|
size=l.config.size)
|
|
|
|
|
|
def __img_norm_layer__(name, input, size, norm_type, scale, power, num_channels,
|
|
blocked, layer_attr):
|
|
if num_channels is None:
|
|
assert input.num_filters is not None
|
|
num_channels = input.num_filters
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.NORM_LAYER,
|
|
inputs=Input(
|
|
input.name,
|
|
norm=Norm(
|
|
norm_type=norm_type,
|
|
channels=num_channels,
|
|
size=size,
|
|
scale=scale,
|
|
pow=power,
|
|
blocked=blocked)),
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
layer_type=LayerType.NORM_LAYER,
|
|
parents=[input],
|
|
num_filters=num_channels,
|
|
img_norm_type=norm_type,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default("crmnorm")
|
|
@layer_support()
|
|
def img_cmrnorm_layer(input,
|
|
size,
|
|
scale=0.0128,
|
|
power=0.75,
|
|
name=None,
|
|
num_channels=None,
|
|
layer_attr=None):
|
|
"""
|
|
Response normalization across feature maps.
|
|
|
|
Reference:
|
|
`ImageNet Classification with Deep Convolutional Neural Networks
|
|
<http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
norm = img_cmrnorm_layer(input=net, size=5)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param size: Normalize in number of :math:`size` feature maps.
|
|
:type size: int
|
|
:param scale: The hyper-parameter.
|
|
:type scale: float
|
|
:param power: The hyper-parameter.
|
|
:type power: float
|
|
:param num_channels: The number of input channels. If the parameter is not set or
|
|
set to None, its actual value will be automatically set to
|
|
the channel number of the input.
|
|
:param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
return __img_norm_layer__(name, input, size, "cmrnorm-projection", scale,
|
|
power, num_channels, 0, layer_attr)
|
|
|
|
|
|
@wrap_bias_attr_default()
|
|
@wrap_param_attr_default(
|
|
default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
|
|
@wrap_act_default(act=ReluActivation())
|
|
@wrap_name_default("batch_norm")
|
|
@layer_support(DROPOUT, ERROR_CLIPPING)
|
|
def batch_norm_layer(input,
|
|
act=None,
|
|
name=None,
|
|
img3D=False,
|
|
num_channels=None,
|
|
bias_attr=None,
|
|
param_attr=None,
|
|
layer_attr=None,
|
|
batch_norm_type=None,
|
|
epsilon=1e-5,
|
|
moving_average_fraction=0.9,
|
|
use_global_stats=None,
|
|
mean_var_names=None):
|
|
"""
|
|
Batch Normalization Layer. The notation of this layer is as follows.
|
|
|
|
:math:`x` is the input features over a mini-batch.
|
|
|
|
.. math::
|
|
|
|
\\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
|
|
\ mini-batch\ mean \\\\
|
|
\\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
|
|
\\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
|
|
\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
|
|
\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
|
|
y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
|
|
|
|
Reference:
|
|
`Batch Normalization: Accelerating Deep Network Training by Reducing
|
|
Internal Covariate Shift
|
|
<http://arxiv.org/abs/1502.03167>`_
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
norm = batch_norm_layer(input=net, act=ReluActivation())
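
A slightly fuller sketch (the layer sizes and parameter values are illustrative
assumptions): batch normalization is typically inserted between a linear
convolution and its non-linearity.

.. code-block:: python

    conv = img_conv_layer(input=data, filter_size=3, num_channels=3,
                          num_filters=16, padding=1,
                          act=LinearActivation(), bias_attr=False)
    bn = batch_norm_layer(input=conv,
                          act=ReluActivation(),
                          moving_average_fraction=0.9,
                          use_global_stats=None)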
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: This layer's input which is to be performed batch normalization on.
|
|
:type input: LayerOutput
|
|
:param batch_norm_type: We have batch_norm, mkldnn_batch_norm and cudnn_batch_norm.
|
|
batch_norm supports CPU, MKLDNN and GPU. cudnn_batch_norm
|
|
requires cuDNN version greater or equal to v4 (>=v4).
|
|
But cudnn_batch_norm is faster and needs less
|
|
memory than batch_norm. mkldnn_batch_norm requires
use_mkldnn to be enabled. By default (None), we will
|
|
automatically select cudnn_batch_norm for GPU,
|
|
mkldnn_batch_norm for MKLDNN and batch_norm for CPU.
|
|
Users can specify the batch norm type. If you use
|
|
cudnn_batch_norm, we suggest using the latest version,
|
|
such as v5.1.
|
|
:type batch_norm_type: None | string, None or "batch_norm" or "cudnn_batch_norm"
|
|
or "mkldnn_batch_norm"
|
|
:param act: Activation type. ReluActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param num_channels: The number of input channels. If the parameter is not set or
|
|
set to None, its actual value will be automatically set to
|
|
the channel number of the input.
|
|
:type num_channels: int
|
|
:param bias_attr: :math:`\\beta`. The bias attribute. If the parameter is set to
|
|
False or an object whose type is not ParameterAttribute, no
|
|
bias is defined. If the parameter is set to True, the bias is
|
|
initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param param_attr: :math:`\\gamma`. The parameter attribute. See ParameterAttribute
|
|
for details.
|
|
:type param_attr: ParameterAttribute
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:param use_global_stats: Whether to use moving mean/variance statistics during
the testing period. If the parameter is set to None or
|
|
True, it will use moving mean/variance statistics
|
|
during testing. If the parameter is set to False, it
|
|
will use the mean and variance of the current batch
|
|
of test data.
|
|
:type use_global_stats: bool | None.
|
|
:param epsilon: The small constant added to the variance to improve numeric stability.
|
|
:type epsilon: float.
|
|
:param moving_average_fraction: Factor used in the moving average computation.
|
|
:math:`runningMean = newMean*(1-factor) + runningMean*factor`
|
|
:type moving_average_fraction: float.
|
|
:param mean_var_names: [mean name, variance name]
|
|
:type mean_var_names: string list
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
if num_channels is None:
|
|
if input.num_filters is not None:
|
|
num_channels = input.num_filters
|
|
else:
|
|
num_channels = input.size
|
|
assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \
|
|
(batch_norm_type == "mkldnn_batch_norm") or \
|
|
(batch_norm_type == "cudnn_batch_norm")
|
|
|
|
l = Layer(
|
|
name=name,
|
|
img3D=img3D,
|
|
inputs=Input(
|
|
input.name, image=Image(channels=num_channels), **param_attr.attr),
|
|
active_type=act.name,
|
|
type=LayerType.BATCH_NORM_LAYER,
|
|
batch_norm_type=batch_norm_type,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
epsilon=epsilon,
|
|
moving_average_fraction=moving_average_fraction,
|
|
use_global_stats=use_global_stats,
|
|
mean_var_names=mean_var_names,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.BATCH_NORM_LAYER,
|
|
parents=[input],
|
|
activation=act,
|
|
num_filters=num_channels,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def sum_to_one_norm_layer(input, name=None, layer_attr=None):
|
|
"""
|
|
A layer for sum-to-one normalization,
|
|
which is used in NEURAL TURING MACHINE.
|
|
|
|
.. math::
|
|
out[i] = \\frac {in[i]} {\sum_{k=1}^N in[k]}
|
|
|
|
where :math:`in` is a (batchSize x dataDim) input vector,
|
|
and :math:`out` is a (batchSize x dataDim) output vector.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
sum_to_one_norm = sum_to_one_norm_layer(input=layer)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute
|
|
for details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.SUM_TO_ONE_NORM_LAYER,
|
|
inputs=[input.name],
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], size=input.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def row_l2_norm_layer(input, name=None, layer_attr=None):
|
|
"""
|
|
A layer for L2-normalization in each row.
|
|
|
|
.. math::
|
|
out[i] = \\frac{in[i]} {\\sqrt{\\sum_{k=1}^N in[k]^{2}}}
|
|
|
|
where the size of :math:`in` is (batchSize x dataDim),
and the size of :math:`out` is also (batchSize x dataDim).
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
row_l2_norm_layer = row_l2_norm_layer(input=layer)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute
|
|
for details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.ROW_L2_NORM_LAYER,
|
|
inputs=[input.name],
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.ROW_L2_NORM_LAYER, parents=[input], size=input.size)
|
|
|
|
|
|
@wrap_name_default("addto")
|
|
@wrap_act_default(act=LinearActivation())
|
|
@wrap_bias_attr_default(has_bias=False)
|
|
@layer_support(DROPOUT, ERROR_CLIPPING)
|
|
def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
|
|
"""
|
|
AddtoLayer.
|
|
|
|
.. math::
|
|
|
|
y = f(\\sum_{i} x_i + b)
|
|
|
|
where :math:`y` is output, :math:`x` is input, :math:`b` is bias,
|
|
and :math:`f` is activation function.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
addto = addto_layer(input=[layer1, layer2],
|
|
act=ReluActivation(),
|
|
bias_attr=False)
|
|
|
|
This layer just simply adds all input layers together, then activates the
|
|
sum. All inputs should share the same dimension, which is also the dimension
|
|
of this layer's output.
|
|
|
|
There is no weight matrix for each input, because it is just a simple add
operation. If you want a more complicated operation before the addition, please use
mixed_layer.
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input layers. It could be a LayerOutput or list/tuple of
|
|
LayerOutput.
|
|
:type input: LayerOutput | list | tuple
|
|
:param act: Activation Type. LinearActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
num_filters = None
|
|
if isinstance(input, LayerOutput):
|
|
input = [input]
|
|
|
|
assert isinstance(input, collections.Sequence)
|
|
ipts_for_layer = []
|
|
for each_input in input:
|
|
assert isinstance(each_input, LayerOutput)
|
|
ipts_for_layer.append(Input(each_input.name))
|
|
if each_input.num_filters is not None:
|
|
num_filters = each_input.num_filters
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.ADDTO_LAYER,
|
|
inputs=ipts_for_layer,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
active_type=act.name,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.ADDTO_LAYER,
|
|
parents=input,
|
|
activation=act,
|
|
num_filters=num_filters,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_act_default(act=IdentityActivation())
|
|
@wrap_name_default("concat")
|
|
@layer_support(DROPOUT, ERROR_CLIPPING)
|
|
def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
|
|
"""
|
|
Concatenate all input vectors to one vector.
|
|
Inputs can be a list of LayerOutput or a list of projection.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
concat = concat_layer(input=[layer1, layer2])
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input layers or projections
|
|
:type input: list | tuple | collections.Sequence
|
|
:param act: Activation type. IdentityActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
if isinstance(input, LayerOutput):
|
|
input = [input]
|
|
elif isinstance(input, Projection):
|
|
input = [input]
|
|
else:
|
|
assert isinstance(input, collections.Sequence)
|
|
|
|
def __is_type__(o, tp):
|
|
if not isinstance(o, collections.Sequence):
|
|
if o == tp:
|
|
return True
|
|
elif len(o.__bases__) == 0:
|
|
return False
|
|
else:
|
|
for bs in o.__bases__:
|
|
if __is_type__(bs, tp):
|
|
return True
|
|
return False
|
|
else:
|
|
tmp = map(lambda _x: __is_type__(_x, tp), o)
|
|
a = tmp[0]
|
|
for b in tmp[1:]:
|
|
assert a == b
|
|
return a
|
|
|
|
def __reduce_concat_type__(a, b):
|
|
assert __is_type__([a, b], Projection) or __is_type__([a, b],
|
|
LayerOutput)
|
|
return a
|
|
|
|
is_concat_layer = __is_type__(
|
|
reduce(__reduce_concat_type__, map(type, input)), LayerOutput)
|
|
|
|
layer_type = (LayerType.CONCAT_LAYER
|
|
if is_concat_layer else LayerType.CONCAT_PROJ_LAYER)
|
|
|
|
if layer_type == LayerType.CONCAT_LAYER:
|
|
assert not bias_attr
|
|
|
|
layer = Layer(
|
|
name=name,
|
|
type=layer_type,
|
|
inputs=[x.name for x in input] if is_concat_layer else input,
|
|
active_type=act.name,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
sz = layer.config.size
|
|
|
|
return LayerOutput(
|
|
name,
|
|
layer_type=layer_type,
|
|
parents=input if is_concat_layer else [x.origin for x in input],
|
|
activation=act,
|
|
size=sz)
|
|
|
|
|
|
@wrap_name_default("seqconcat")
|
|
@wrap_act_default(act=IdentityActivation())
|
|
@wrap_bias_attr_default(has_bias=False)
|
|
@layer_support(DROPOUT, ERROR_CLIPPING)
|
|
def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
|
|
bias_attr=None):
|
|
"""
|
|
Concatenate sequence a and sequence b.
|
|
|
|
Inputs:
|
|
- a = [a1, a2, ..., am]
|
|
- b = [b1, b2, ..., bn]
|
|
|
|
Output: [a1, ..., am, b1, ..., bn]
|
|
|
|
Note that the above computation is for one sample. Multiple samples are
|
|
processed in one batch.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
concat = seq_concat_layer(a=layer1, b=layer2)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param a: The first input sequence layer
|
|
:type a: LayerOutput
|
|
:param b: The second input sequence layer
|
|
:type b: LayerOutput
|
|
:param act: Activation type. IdentityActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
|
|
assert a.size == b.size
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.SEQUENCE_CONCAT_LAYER,
|
|
inputs=[a.name, b.name],
|
|
active_type=act.name,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name,
|
|
layer_type=LayerType.SEQUENCE_CONCAT_LAYER,
|
|
parents=[a, b],
|
|
activation=act,
|
|
size=a.size)
|
|
|
|
|
|
@wrap_name_default("memory", "memory_name")
|
|
def memory(name,
|
|
size,
|
|
memory_name=None,
|
|
is_seq=False,
|
|
boot_layer=None,
|
|
boot_bias=None,
|
|
boot_bias_active_type=None,
|
|
boot_with_const_id=None):
|
|
"""
|
|
The memory takes a layer's output at previous time step as its own output.
|
|
|
|
If boot_bias is set, the activation of the bias is the initial value of the memory.
|
|
|
|
If boot_with_const_id is set, then the memory's output at the first time step
is an IndexSlot, and Arguments.ids()[0] is this constant id (:code:`boot_with_const_id`).
|
|
|
|
If boot_layer is specified, the memory's output at the first time step will
|
|
be the boot_layer's output.
|
|
|
|
Otherwise, the memory's output at the first time step is zero by default.
|
|
|
|
.. code-block:: python
|
|
|
|
mem = memory(size=256, name='state')
|
|
state = fc_layer(input=mem, size=256, name='state')
|
|
|
|
If you do not want to specify the name, you can also use set_input()
to specify the layer to be remembered, as follows:
|
|
|
|
.. code-block:: python
|
|
|
|
mem = memory(size=256)
|
|
state = fc_layer(input=mem, size=256)
|
|
mem.set_input(state)
|
|
|
|
:param name: The name of the layer which this memory remembers.
|
|
If name is None, user should call set_input() to specify the
|
|
name of the layer which this memory remembers.
|
|
:type name: basestring
|
|
:param size: The dimensionality of memory.
|
|
:type size: int
|
|
:param memory_name: The name of the memory. It is ignored when name is provided.
|
|
:type memory_name: basestring
|
|
:param is_seq: DEPRECATED. is sequence for boot_layer
|
|
:type is_seq: bool
|
|
:param boot_layer: This parameter specifies memory's output at the first time
|
|
step and the output is boot_layer's output.
|
|
:type boot_layer: LayerOutput | None
|
|
:param boot_bias: The bias attribute of memory's output at the first time step.
|
|
If the parameter is set to False or an object whose type is not
|
|
ParameterAttribute, no bias is defined. If the parameter is set
|
|
to True, the bias is initialized to zero.
|
|
:type boot_bias: ParameterAttribute | None
|
|
:param boot_bias_active_type: Activation type for memory's bias at the first time
|
|
step. LinearActivation is the default activation.
|
|
:type boot_bias_active_type: BaseActivation
|
|
:param boot_with_const_id: This parameter specifies memory's output at the first
|
|
time step and the output is an index.
|
|
:type boot_with_const_id: int
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if boot_bias_active_type is None:
|
|
boot_bias_active_type = LinearActivation()
|
|
|
|
assert boot_bias is None or isinstance(boot_bias, ParameterAttribute)
|
|
if isinstance(boot_bias, ParameterAttribute):
|
|
boot_bias = ParamAttr.to_bias(boot_bias)
|
|
|
|
assert boot_layer is None or isinstance(boot_layer, LayerOutput)
|
|
if name is not None:
|
|
memory_name = None
|
|
|
|
memory_name = Memory(
|
|
name,
|
|
size,
|
|
boot_layer=boot_layer.name if boot_layer is not None else None,
|
|
boot_bias=boot_bias,
|
|
boot_bias_active_type=boot_bias_active_type.name,
|
|
boot_with_const_id=boot_with_const_id,
|
|
memory_name=memory_name)
|
|
|
|
lout = LayerOutput(
|
|
name=memory_name,
|
|
size=size,
|
|
layer_type=LayerType.MEMORY,
|
|
parents=[boot_layer] if boot_layer is not None else None)
|
|
return lout
|
|
|
|
|
|
@wrap_bias_attr_default()
|
|
@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
|
|
@wrap_act_default(param_names=['state_act'], act=TanhActivation())
|
|
@wrap_act_default(act=TanhActivation())
|
|
@wrap_name_default('lstm_step')
|
|
@layer_support()
|
|
def lstm_step_layer(input,
|
|
state,
|
|
size=None,
|
|
act=None,
|
|
name=None,
|
|
gate_act=None,
|
|
state_act=None,
|
|
bias_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
LSTM Step Layer. This function is used only in recurrent_group.
|
|
The lstm equations are shown as follows.
|
|
|
|
.. math::
|
|
|
|
i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
|
|
|
|
f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
|
|
|
|
c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
|
|
|
|
o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
|
|
|
|
h_t & = o_t tanh(c_t)
|
|
|
|
|
|
The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
|
|
:code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
|
|
input vectors.
|
|
|
|
The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do
|
|
|
|
.. math::
|
|
|
|
i_t = \\sigma(input + W_{ci}c_{t-1} + b_i)
|
|
|
|
...
|
|
|
|
|
|
This layer has two outputs. The default output is :math:`h_t`. The other
output is the cell state :math:`c_t`, whose name is 'state', and users can use
:code:`get_output_layer` to extract this output.
|
|
|
|
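A minimal usage sketch inside a recurrent_group step function (the names
lstm_input, hidden_dim and the memory below are assumptions, not part of this
API; lstm_input is assumed to already hold :math:`Wx_t + Wh_{t-1}`, e.g. built
with mixed_layer and full_matrix_projection):

.. code-block:: python

    # The cell memory remembers the 'state' output exposed below.
    cell_mem = memory(name='lstm_state', size=hidden_dim)
    lstm_out = lstm_step_layer(input=lstm_input,
                               state=cell_mem,
                               name='lstm',
                               size=hidden_dim)
    # Expose the second output under the name the memory remembers.
    get_output_layer(name='lstm_state', input=lstm_out, arg_name='state')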
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param size: The dimension of this layer's output, which must be
|
|
equal to the dimension of the state.
|
|
:type size: int
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param state: The state of the LSTM unit.
|
|
:type state: LayerOutput
|
|
:param act: Activation type. TanhActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param gate_act: Activation type of the gate. SigmoidActivation is the
|
|
default activation.
|
|
:type gate_act: BaseActivation
|
|
:param state_act: Activation type of the state. TanhActivation is the
|
|
default activation.
|
|
:type state_act: BaseActivation
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
assert size is None or state.size == size
|
|
size = state.size
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.LSTM_STEP_LAYER,
|
|
active_type=act.name,
|
|
active_gate_type=gate_act.name,
|
|
active_state_type=state_act.name,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
size=state.size,
|
|
inputs=[input.name, state.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.LSTM_STEP_LAYER,
|
|
parents=[input, state],
|
|
activation=act,
|
|
size=size,
|
|
outputs=['default', 'state'])
|
|
|
|
|
|
@wrap_bias_attr_default()
|
|
@wrap_param_attr_default()
|
|
@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
|
|
@wrap_act_default(act=TanhActivation())
|
|
@wrap_name_default('gru_step')
|
|
@layer_support()
|
|
def gru_step_layer(input,
|
|
output_mem,
|
|
size=None,
|
|
act=None,
|
|
name=None,
|
|
gate_act=None,
|
|
bias_attr=None,
|
|
param_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
|
|
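A minimal usage sketch inside a recurrent_group step function (gru_input and
hidden_dim are assumed names; gru_input is assumed to already have dimension
3 * hidden_dim, e.g. produced by a full_matrix_projection of the raw input):

.. code-block:: python

    out_mem = memory(name='gru', size=hidden_dim)
    gru_out = gru_step_layer(name='gru',
                             input=gru_input,
                             output_mem=out_mem,
                             size=hidden_dim)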
:param input: The input of this layer, whose dimension must be divisible by 3.
|
|
:type input: LayerOutput
|
|
:param output_mem: A memory which memorizes the output of this layer at previous
|
|
time step.
|
|
:type output_mem: LayerOutput
|
|
:param size: The dimension of this layer's output. If it is not set or set to None,
|
|
it will be set to one-third of the dimension of the input automatically.
|
|
:type size: int
|
|
:param act: Activation type of this layer's output. TanhActivation
|
|
is the default activation.
|
|
:type act: BaseActivation
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param gate_act: Activation type of this layer's two gates. SigmoidActivation is
|
|
the default activation.
|
|
:type gate_act: BaseActivation
|
|
:param bias_attr: The parameter attribute for bias. If this parameter is set to
|
|
False or an object whose type is not ParameterAttribute, no bias
|
|
is defined. If this parameter is set to True,
|
|
the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert input.size % 3 == 0
|
|
if size is None:
|
|
size = input.size / 3
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.GRU_STEP_LAYER,
|
|
# The parameter here is for transforming the output_mem. The input has
|
|
# already been transformed outside this module so it does not need
|
|
# parameter associated with it.
|
|
# The parameter here is instead grouped with input is due to
|
|
# backward model compatibility.
|
|
inputs=[Input(input.name, **param_attr.attr), output_mem.name],
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
size=size,
|
|
active_type=act.name,
|
|
active_gate_type=gate_act.name,
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.GRU_STEP_LAYER,
|
|
parents=[input, output_mem],
|
|
size=size,
|
|
activation=act)
|
|
|
|
|
|
@wrap_bias_attr_default()
|
|
@wrap_param_attr_default()
|
|
@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
|
|
@wrap_act_default(act=TanhActivation())
|
|
@wrap_name_default('gru_step_naive')
|
|
@layer_support(ERROR_CLIPPING, DROPOUT)
|
|
def gru_step_naive_layer(input,
|
|
output_mem,
|
|
size=None,
|
|
name=None,
|
|
act=None,
|
|
gate_act=None,
|
|
bias_attr=None,
|
|
param_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
GRU Step Layer, which is composed of basic PaddlePaddle layers (mixed_layer,
projections and dotmul_operator) instead of the dedicated GRU_STEP_LAYER type
used by gru_step_layer. Because of this, it supports ERROR_CLIPPING and DROPOUT.
|
|
|
|
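A minimal usage sketch (same calling convention as gru_step_layer above;
gru_input and hidden_dim are assumed names):

.. code-block:: python

    out_mem = memory(name='gru_naive', size=hidden_dim)
    gru_out = gru_step_naive_layer(name='gru_naive',
                                   input=gru_input,
                                   output_mem=out_mem,
                                   size=hidden_dim)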
:param input: The input of this layer, whose dimension must be divisible by 3.
:type input: LayerOutput
|
|
:param output_mem: A memory which memorizes the output of this layer at previous
|
|
time step.
|
|
:type output_mem: LayerOutput
|
|
:param size: The dimension of this layer's output. If it is not set or set to None,
|
|
it will be set to one-third of the dimension of the input automatically.
|
|
:type size: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param act: Activation type of this layer's output. TanhActivation
|
|
is the default activation.
|
|
:type act: BaseActivation
|
|
:param gate_act: Activation type of this layer's two gates. SigmoidActivation
|
|
is the default activation.
|
|
:type gate_act: BaseActivation
|
|
:param bias_attr: The parameter attribute for bias. If this parameter is set to
|
|
False or an object whose type is not ParameterAttribute, no bias
|
|
is defined. If this parameter is set to True,
|
|
the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if input.size % 3 != 0:
|
|
raise ValueError("GruStep input size must be divided by 3")
|
|
if size is None:
|
|
size = input.size / 3
|
|
|
|
if bias_attr and bias_attr.attr.get("parameter_name", None) is not None:
|
|
raise ValueError("You should not specify the field `name` in bias_attr."
|
|
" Otherwise, the three biases, which correponding to "
|
|
" the two gates and the mixed layer for computing Wx+b"
|
|
", will share the same parameter matrix unexpectedly.")
|
|
|
|
def __gate__(gate_name, offset):
|
|
with mixed_layer(
|
|
name=name + "_" + gate_name,
|
|
size=size,
|
|
layer_attr=layer_attr,
|
|
bias_attr=bias_attr,
|
|
act=gate_act) as gate:
|
|
gate += identity_projection(input=input, offset=offset)
|
|
gate += full_matrix_projection(
|
|
input=output_mem, param_attr=param_attr)
|
|
return gate
|
|
|
|
update_gate = __gate__("update", 0)
|
|
reset_gate = __gate__("reset", size)
|
|
|
|
with mixed_layer(
|
|
name=name + "_reset_output", bias_attr=False) as reset_output:
|
|
reset_output += dotmul_operator(a=output_mem, b=reset_gate)
|
|
|
|
with mixed_layer(
|
|
name=name + "_output_candidate",
|
|
size=size,
|
|
layer_attr=layer_attr,
|
|
bias_attr=bias_attr,
|
|
act=act) as output_candidate:
|
|
output_candidate += identity_projection(input=input, offset=2 * size)
|
|
output_candidate += full_matrix_projection(
|
|
input=reset_output, param_attr=param_attr)
|
|
|
|
with mixed_layer(name=name) as output:
|
|
output += identity_projection(output_mem)
|
|
output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0)
|
|
output += dotmul_operator(a=output_candidate, b=update_gate)
|
|
|
|
return output
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def get_output_layer(input, arg_name, name=None, layer_attr=None):
|
|
"""
|
|
Get a layer's output by name. In PaddlePaddle, a layer might have multiple
outputs, but only the default one is returned. If the user wants to use another
output besides the default one, please use get_output_layer first to extract
the output from the input layer.
|
|
|
|
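A minimal usage sketch (lstm_out is assumed to be the output of a layer with
multiple outputs, e.g. lstm_step_layer, which exposes an extra output named
'state'):

.. code-block:: python

    cell_state = get_output_layer(input=lstm_out, arg_name='state')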
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input layer. And this layer should contain
|
|
multiple outputs.
|
|
:type input: LayerOutput
|
|
:param arg_name: The name of the output to be extracted from the input layer.
|
|
:type arg_name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
# GetOutputLayer
|
|
assert arg_name in input.outputs, \
'Cannot get output "%s" from the input layer; the available outputs are: %s' % (
arg_name, ",".join(input.outputs))
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.GET_OUTPUT_LAYER,
|
|
inputs=[Input(
|
|
input.name, input_layer_argument=arg_name)],
|
|
size=input.size,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.GET_OUTPUT_LAYER,
|
|
parents=[input],
|
|
size=input.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@wrap_act_default()
|
|
@wrap_bias_attr_default()
|
|
@wrap_param_attr_default()
|
|
@layer_support()
|
|
def recurrent_layer(input,
|
|
act=None,
|
|
bias_attr=None,
|
|
param_attr=None,
|
|
name=None,
|
|
reverse=False,
|
|
layer_attr=None):
|
|
"""
|
|
Simple recurrent unit layer. It is essentially a fully connected layer applied
through both time and the network.
|
|
|
|
For each sequence [start, end] it performs the following computation\:
|
|
|
|
.. math::
|
|
|
|
out_{i} = act(in_{i}) \\ \\ \\text{for} \\ i = start \\\\
|
|
out_{i} = act(in_{i} + out_{i-1} * W) \\ \\ \\text{for} \\ start < i <= end
|
|
|
|
If reverse is set to True, the order is reversed\:
|
|
|
|
.. math::
|
|
|
|
out_{i} = act(in_{i}) \\ \\ \\text{for} \\ i = end \\\\
|
|
out_{i} = act(in_{i} + out_{i+1} * W) \\ \\ \\text{for} \\ start <= i < end
|
|
|
|
|
|
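A minimal usage sketch (seq_in is an assumed sequence LayerOutput, e.g. the
output of an fc_layer applied to the input sequence):

.. code-block:: python

    rnn = recurrent_layer(input=seq_in, act=TanhActivation(), reverse=False)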
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param act: Activation type. TanhActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param bias_attr: The parameter attribute for bias. If this parameter is set to
|
|
False or an object whose type is not ParameterAttribute,
|
|
no bias is defined. If the parameter is set to True,
|
|
the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for
|
|
details.
|
|
:type param_attr: ParameterAttribute
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.RECURRENT_LAYER,
|
|
inputs=Input(input.name, **param_attr.attr),
|
|
active_type=act.name,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
reversed=reverse,
|
|
**ExtraAttr.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.RECURRENT_LAYER,
|
|
parents=[input],
|
|
size=input.size,
|
|
activation=act,
|
|
reverse=reverse)
|
|
|
|
|
|
class StaticInput(object):
|
|
"""
|
|
StaticInput is only used in recurrent_group. It defines a read-only memory
and can be a sequence or a non-sequence.
|
|
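A minimal usage sketch (step, decoder_inputs and encoder_last are assumed
names; the StaticInput is visible, unchanged, at every time step of the
recurrent_group):

.. code-block:: python

    group = recurrent_group(step=step,
                            input=[decoder_inputs, StaticInput(encoder_last)])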
:param size: DEPRECATED
|
|
:param is_seq: DEPRECATED
|
|
"""
|
|
|
|
def __init__(self, input, is_seq=False, size=None):
|
|
assert isinstance(input, LayerOutput)
|
|
self.input = input
|
|
assert input.size is not None
|
|
if size is not None:
|
|
assert input.size == size
|
|
|
|
|
|
def SubsequenceInput(input):
|
|
"""
|
|
DEPRECATED.
|
|
Input sequence has sub-sequence, used in recurrent_group.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
input = SubsequenceInput(layer)
|
|
"""
|
|
return input
|
|
|
|
|
|
@wrap_name_default("recurrent_group")
|
|
def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
|
|
"""
|
|
Recurrent layer group is an extremely flexible recurrent unit in
|
|
PaddlePaddle. As long as the user defines the calculation done within a
|
|
time step, PaddlePaddle will iterate such a recurrent calculation over
|
|
sequence input. This is useful for attention-based models, or Neural
|
|
Turing Machine like models.
|
|
|
|
The basic usage (time steps) is:
|
|
|
|
.. code-block:: python
|
|
|
|
def step(input):
|
|
output = fc_layer(input=input,
|
|
size=1024,
|
|
act=LinearActivation(),
|
|
bias_attr=False)
|
|
return output
|
|
|
|
group = recurrent_group(input=layer,
|
|
step=step)
|
|
|
|
You can see following configs for further usages:
|
|
|
|
- time steps: lstmemory_group, paddle/gserver/tests/sequence_layer_group.conf, \
|
|
demo/seqToseq/seqToseq_net.py
|
|
- sequence steps: paddle/gserver/tests/sequence_nest_layer_group.conf
|
|
|
|
:param step: A step function which takes the input of recurrent_group as its own
|
|
input and returns values as recurrent_group's output every time step.
|
|
|
|
The recurrent group scatters a sequence into time steps. And
|
|
for each time step, it will invoke step function, and return
|
|
a time step result. Then gather outputs of each time step into
|
|
layer group's output.
|
|
|
|
:type step: callable
|
|
|
|
:param name: The recurrent_group's name. It is optional.
|
|
:type name: basestring
|
|
|
|
:param input: Input links array.
|
|
|
|
LayerOutput will be scattered into time steps.
|
|
SubsequenceInput will be scattered into sequence steps.
|
|
StaticInput will be imported to each time step, and doesn't change
|
|
over time. It's a mechanism to access a layer outside the step function.
|
|
|
|
:type input: LayerOutput | StaticInput | SubsequenceInput | list | tuple
|
|
|
|
:param reverse: If reverse is set to True, the recurrent unit will process the
|
|
input sequence in a reverse order.
|
|
:type reverse: bool
|
|
|
|
:param targetInlink: DEPRECATED.
|
|
The input layer which shares info with the layer group's output.

Param input specifies multiple input layers. For
SubsequenceInput inputs, config should assign one input
layer that shares info (the number of sentences and the number
of words in each sentence) with all the layer group's outputs.
targetInlink should be one of the layer group's inputs.
|
|
|
|
:type targetInlink: LayerOutput | SubsequenceInput
|
|
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
model_type('recurrent_nn')
|
|
|
|
if isinstance(input, LayerOutput) or isinstance(input, StaticInput):
|
|
input = [input]
|
|
assert isinstance(input, collections.Sequence)
|
|
|
|
def is_in_links(x):
|
|
return isinstance(x, LayerOutput)
|
|
|
|
in_links = filter(is_in_links, input)
|
|
|
|
RecurrentLayerGroupWithoutOutLinksBegin(
|
|
name=name,
|
|
in_links=map(lambda x: x.name, in_links),
|
|
seq_reversed=reverse)
|
|
in_args = []
|
|
for each_input in input:
|
|
if isinstance(each_input, StaticInput): # StaticInput
|
|
mem_name = "__%s_memory__" % each_input.input.name
|
|
mem = memory(
|
|
name=None,
|
|
size=each_input.input.size,
|
|
boot_layer=each_input.input)
|
|
mem.set_input(mem)
|
|
in_args.append(mem)
|
|
else:
|
|
in_args.append(each_input)
|
|
|
|
layer_outs = step(*in_args)
|
|
|
|
if isinstance(layer_outs, LayerOutput):
|
|
layer_outs = [layer_outs]
|
|
|
|
for layer_out in layer_outs:
|
|
assert isinstance(
|
|
layer_out, LayerOutput
|
|
), "Type of step function's return value must be LayerOutput."
|
|
layer_out.reverse = reverse
|
|
RecurrentLayerGroupSetOutLink(layer_out.name)
|
|
|
|
RecurrentLayerGroupEnd(name=name)
|
|
|
|
for layer_out in layer_outs:
|
|
# The previous full_name is the name inside the recurrent group.
|
|
# We need a full_name outside the recurrent group.
|
|
layer_out.full_name = MakeLayerNameInSubmodel(layer_out.name)
|
|
|
|
if len(layer_outs) == 1:
|
|
return layer_outs[0]
|
|
else:
|
|
return layer_outs
|
|
|
|
|
|
class BaseGeneratedInput(object):
|
|
def __init__(self):
|
|
self.bos_id = None
|
|
self.eos_id = None
|
|
|
|
def before_real_step(self):
|
|
raise NotImplementedError()
|
|
|
|
def after_real_step(self, *args):
|
|
raise NotImplementedError()
|
|
|
|
|
|
class GeneratedInput(BaseGeneratedInput):
|
|
def after_real_step(self, input):
|
|
if isinstance(input, LayerOutput):
|
|
input = [input]
|
|
elif isinstance(input, collections.Sequence):
|
|
input = list(input)
|
|
if len(input) > 1:
|
|
logger.info(
|
|
("More than one layers inside the recurrent_group "
|
|
"are returned as outputs of the entire recurrent_group "
|
|
"PLEASE garantee the first output is probability of "
|
|
"the predicted next word."))
|
|
|
|
return [maxid_layer(
|
|
input=input[0], name='__beam_search_predict__')] + (
|
|
input[1:] if len(input) > 1 else [])
|
|
|
|
def before_real_step(self):
|
|
predict_id = memory(
|
|
name='__beam_search_predict__',
|
|
size=self.size,
|
|
boot_with_const_id=self.bos_id)
|
|
|
|
trg_emb = embedding_layer(
|
|
input=predict_id,
|
|
size=self.embedding_size,
|
|
param_attr=ParamAttr(name=self.embedding_name))
|
|
return trg_emb
|
|
|
|
def __init__(self, size, embedding_name, embedding_size):
|
|
super(GeneratedInput, self).__init__()
|
|
self.size = size
|
|
self.embedding_name = embedding_name
|
|
self.embedding_size = embedding_size
|
|
|
|
|
|
@wrap_name_default()
|
|
def maxid_layer(input, name=None, layer_attr=None):
|
|
"""
|
|
A layer for finding the id which has the maximal value for each sample.
|
|
The result is stored in output.ids.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
maxid = maxid_layer(input=layer)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
assert isinstance(input, LayerOutput)
|
|
l = Layer(
|
|
name=name,
|
|
type='maxid',
|
|
inputs=[input.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.MAXID_LAYER,
|
|
parents=[input],
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
def dot_prod_layer(input1, input2, name=None, layer_attr=None):
|
|
"""
|
|
A layer for computing the dot product of two vectors.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
dot_prod = dot_prod_layer(input1=vec1, input2=vec2)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input1: The first input layer.
|
|
:type input1: LayerOutput
|
|
:param input2: The second input layer.
|
|
:type input2: LayerOutput
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input1, LayerOutput)
|
|
assert isinstance(input2, LayerOutput)
|
|
assert input1.size == input2.size, ("Two inputs should have the same size.")
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.DOT_PROD_LAYER,
|
|
inputs=[input1.name, input2.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.DOT_PROD_LAYER,
|
|
parents=[input1, input2],
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
def out_prod_layer(input1, input2, name=None, layer_attr=None):
|
|
"""
|
|
A layer for computing the outer product of two vectors
|
|
The result is a matrix of size(input1) x size(input2)
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
out_prod = out_prod_layer(input1=vec1, input2=vec2)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input1: The first input layer.
|
|
:type input: LayerOutput
|
|
:param input2: The second input layer.
|
|
:type input2: LayerOutput
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
assert isinstance(input1, LayerOutput)
|
|
assert isinstance(input2, LayerOutput)
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.OUT_PROD_LAYER,
|
|
inputs=[input1.name, input2.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.OUT_PROD_LAYER,
|
|
parents=[input1, input2],
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
def eos_layer(input, eos_id, name=None, layer_attr=None):
|
|
"""
|
|
A layer for checking EOS for each sample:
|
|
- output_id = (input_id == conf.eos_id)
|
|
|
|
The result is stored in output\_.ids.
|
|
It is used by recurrent layer group.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
eos = eos_layer(input=layer, eos_id=id)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param eos_id: End id of sequence
|
|
:type eos_id: int
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.EOSID_LAYER,
|
|
eos_id=eos_id,
|
|
inputs=[input.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.EOSID_LAYER,
|
|
parents=[input],
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
def beam_search(step,
|
|
input,
|
|
bos_id,
|
|
eos_id,
|
|
beam_size,
|
|
max_length=500,
|
|
name=None,
|
|
num_results_per_sample=None):
|
|
"""
|
|
Beam search is a heuristic search algorithm used in sequence generation.
|
|
It explores a graph by expanding the most promising nodes in a limited set
|
|
to maintain tractability.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
def rnn_step(input):
|
|
last_time_step_output = memory(name='rnn', size=512)
|
|
with mixed_layer(size=512, name='rnn') as simple_rnn:
|
|
simple_rnn += full_matrix_projection(input)
|
|
simple_rnn += last_time_step_output
|
|
return simple_rnn
|
|
|
|
generated_word_embedding = GeneratedInput(
|
|
size=target_dictionary_dim,
|
|
embedding_name="target_language_embedding",
|
|
embedding_size=word_vector_dim)
|
|
|
|
beam_gen = beam_search(name="decoder",
|
|
step=rnn_step,
|
|
input=[StaticInput(encoder_last),
|
|
generated_word_embedding],
|
|
bos_id=0,
|
|
eos_id=1,
|
|
beam_size=5)
|
|
|
|
Please see the following demo for more details:
|
|
|
|
- machine translation : demo/seqToseq/translation/gen.conf \
|
|
demo/seqToseq/seqToseq_net.py
|
|
|
|
:param name: The name of the recurrent unit that is responsible for
|
|
generating sequences. It is optional.
|
|
:type name: basestring
|
|
:param step: A callable function that defines the calculation in a time
|
|
step, and it is applied to sequences with arbitrary length by
|
|
sharing a same set of weights.
|
|
|
|
You can refer to the first parameter of recurrent_group, or
|
|
demo/seqToseq/seqToseq_net.py for more details.
|
|
:type step: callable
|
|
:param input: Input data for the recurrent unit, which should include the
|
|
previously generated words as a GeneratedInput object.
|
|
In beam_search, none of the input's type should be LayerOutput.
|
|
:type input: list
|
|
:param bos_id: Index of the start symbol in the dictionary. The start symbol
|
|
is a special token for NLP task, which indicates the
|
|
beginning of a sequence. In the generation task, the start
|
|
symbol is essential, since it is used to initialize the RNN
|
|
internal state.
|
|
:type bos_id: int
|
|
:param eos_id: Index of the end symbol in the dictionary. The end symbol is
|
|
a special token for NLP task, which indicates the end of a
|
|
sequence. The generation process will stop once the end
|
|
symbol is generated, or a pre-defined max iteration number
|
|
is exceeded.
|
|
:type eos_id: int
|
|
:param max_length: Max generated sequence length.
|
|
:type max_length: int
|
|
:param beam_size: Beam search for sequence generation is an iterative search
|
|
algorithm. To maintain tractability, every iteration
only stores a predetermined number, called the beam_size,
|
|
of the most promising next words. The greater the beam
|
|
size, the fewer candidate words are pruned.
|
|
:type beam_size: int
|
|
:param num_results_per_sample: Number of the generated results per input
|
|
sequence. This number must not be greater than
the beam size.
|
|
:type num_results_per_sample: int
|
|
:return: The generated word index.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
if num_results_per_sample is None:
|
|
num_results_per_sample = beam_size
|
|
if num_results_per_sample > beam_size:
|
|
logger.warning("num_results_per_sample should be less than beam_size")
|
|
|
|
if isinstance(input, StaticInput) or isinstance(input, BaseGeneratedInput):
|
|
input = [input]
|
|
|
|
generated_input_index = -1
|
|
|
|
real_input = []
|
|
for i, each_input in enumerate(input):
|
|
assert not isinstance(each_input, LayerOutput), (
|
|
"in beam_search, "
|
|
"none of the input should has a type of LayerOutput.")
|
|
if isinstance(each_input, BaseGeneratedInput):
|
|
assert generated_input_index == -1, ("recurrent_group accepts "
|
|
"only one GeneratedInput.")
|
|
generated_input_index = i
|
|
|
|
else:
|
|
real_input.append(each_input)
|
|
|
|
assert generated_input_index != -1, "No GeneratedInput is given."
|
|
|
|
gipt = input[generated_input_index]
|
|
|
|
gipt.bos_id = bos_id
|
|
gipt.eos_id = eos_id
|
|
|
|
def __real_step__(*args):
|
|
eos_name = "__%s_eos_layer__" % name
|
|
RecurrentLayerGroupSetGenerator(
|
|
Generator(
|
|
eos_layer_name=eos_name,
|
|
max_num_frames=max_length,
|
|
beam_size=beam_size,
|
|
num_results_per_sample=num_results_per_sample))
|
|
|
|
args = list(args)
|
|
args.insert(generated_input_index, gipt.before_real_step())
|
|
|
|
predict = gipt.after_real_step(step(*args))
|
|
|
|
eos_layer(input=predict[0], eos_id=eos_id, name=eos_name)
|
|
return predict
|
|
|
|
return recurrent_group(
|
|
step=__real_step__, input=real_input, reverse=False, name=name)
|
|
|
|
|
|
def __cost_input__(input, label, weight=None):
|
|
"""
|
|
inputs and parents for cost layers.
|
|
"""
|
|
if isinstance(input, LayerOutput):
|
|
input = [input]
|
|
if isinstance(label, LayerOutput):
|
|
label = [label]
|
|
ipts = [Input(ipt.name) for ipt in (input + label)]
|
|
parents = [ipt for ipt in (input + label)]
|
|
if weight is not None:
|
|
assert weight.size == 1
|
|
ipts.append(Input(weight.name))
|
|
parents.append(weight)
|
|
return ipts, parents
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def square_error_cost(input,
|
|
label,
|
|
weight=None,
|
|
name=None,
|
|
coeff=1.0,
|
|
layer_attr=None):
|
|
"""
|
|
sum of square error cost:
|
|
|
|
.. math::
|
|
|
|
cost = \\sum_{i=1}^N(t_i-y_i)^2
|
|
|
|
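A minimal usage sketch (prediction and label are assumed layers, e.g. an
fc_layer output and a data_layer):

.. code-block:: python

    cost = square_error_cost(input=prediction, label=label)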
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The first input layer.
|
|
:type input: LayerOutput
|
|
:param label: The input label.
|
|
:type label: LayerOutput
|
|
:param weight: The weight layer defines a weight for each sample in the
|
|
mini-batch. It is optional.
|
|
:type weight: LayerOutput
|
|
:param coeff: The weight of the gradient in the back propagation.
|
|
1.0 is the default value.
|
|
:type coeff: float
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
ipts, parents = __cost_input__(input, label, weight)
|
|
|
|
Layer(
|
|
inputs=ipts,
|
|
type="square_error",
|
|
name=name,
|
|
coeff=coeff,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(name, LayerType.COST, parents=parents, size=1)
|
|
|
|
|
|
regression_cost = square_error_cost
|
|
|
|
|
|
@wrap_name_default("cost")
|
|
@layer_support()
|
|
def classification_cost(input,
|
|
label,
|
|
weight=None,
|
|
name=None,
|
|
evaluator=classification_error_evaluator,
|
|
layer_attr=None,
|
|
coeff=1.):
|
|
"""
|
|
Classification cost layer.
|
|
|
|
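A minimal usage sketch (hidden, num_classes and label are assumed names; the
input must use SoftmaxActivation, as asserted below):

.. code-block:: python

    predict = fc_layer(input=hidden, size=num_classes, act=SoftmaxActivation())
    cost = classification_cost(input=predict, label=label)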
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The first input layer.
|
|
:type input: LayerOutput
|
|
:param label: The input label.
|
|
:type label: LayerOutput
|
|
:param weight: The weight layer defines a weight for each sample in the
|
|
mini-batch. It is optional.
|
|
:type weight: LayerOutput
|
|
:param evaluator: Evaluator method. classification_error_evaluator is the default.
|
|
:type evaluator: Evaluator method
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:param coeff: The weight of the gradient in the back propagation.
|
|
1.0 is the default value.
|
|
:type coeff: float
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert input.layer_type != LayerType.DATA
|
|
assert isinstance(input.activation, SoftmaxActivation)
|
|
assert label.layer_type == LayerType.DATA
|
|
|
|
ipts, parents = __cost_input__(input, label, weight)
|
|
|
|
Layer(
|
|
name=name,
|
|
type="multi-class-cross-entropy",
|
|
inputs=ipts,
|
|
coeff=coeff,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
def __add_evaluator__(e):
|
|
assert callable(e)
|
|
assert hasattr(e, 'is_evaluator')
|
|
assert isinstance(e.is_evaluator, bool)
|
|
assert e.is_evaluator
|
|
assert hasattr(e, "for_classification")
|
|
assert isinstance(e.for_classification, bool)
|
|
assert e.for_classification
|
|
|
|
e(name=e.__name__, input=input, label=label, weight=weight)
|
|
|
|
if not isinstance(evaluator, collections.Sequence):
|
|
evaluator = [evaluator]
|
|
|
|
for each_evaluator in evaluator:
|
|
__add_evaluator__(each_evaluator)
|
|
|
|
return LayerOutput(name, LayerType.COST, parents=parents, size=1)
|
|
|
|
|
|
def conv_operator(img,
|
|
filter,
|
|
filter_size,
|
|
num_filters,
|
|
num_channels=None,
|
|
stride=1,
|
|
padding=0,
|
|
filter_size_y=None,
|
|
stride_y=None,
|
|
padding_y=None,
|
|
trans=False):
|
|
"""
|
|
Different from img_conv_layer, conv_op is an Operator, which can be used
|
|
in mixed_layer. And conv_op takes two inputs to perform convolution.
|
|
The first input is the image and the second is the filter kernel. It only
|
|
supports GPU mode.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
op = conv_operator(img=input1,
|
|
filter=input2,
|
|
filter_size=3,
|
|
num_filters=64,
|
|
num_channels=64)
|
|
|
|
:param img: The input image.
|
|
:type img: LayerOutput
|
|
:param filter: The input filter.
|
|
:type filter: LayerOutput
|
|
:param filter_size: The dimension of the filter kernel on the x axis.
|
|
:type filter_size: int
|
|
:param filter_size_y: The dimension of the filter kernel on the y axis.
|
|
If the parameter is not set or set to None, it will
|
|
be set to 'filter_size' automatically.
|
|
:type filter_size_y: int
|
|
:param num_filters: The number of the output channels.
|
|
:type num_filters: int
|
|
:param num_channels: The number of the input channels. If the parameter is not set
|
|
or set to None, it will be automatically set to the channel
|
|
number of the 'img'.
|
|
:type num_channels: int
|
|
:param stride: The stride on the x axis.
|
|
:type stride: int
|
|
:param stride_y: The stride on the y axis. If the parameter is not set or
|
|
set to None, it will be set to 'stride' automatically.
|
|
:type stride_y: int
|
|
:param padding: The padding size on the x axis.
|
|
:type padding: int
|
|
:param padding_y: The padding size on the y axis. If the parameter is not set
|
|
or set to None, it will be set to 'padding' automatically.
|
|
:type padding_y: int
|
|
:return: A ConvOperator Object.
|
|
:rtype: ConvOperator
|
|
"""
|
|
if filter_size_y is None:
|
|
filter_size_y = filter_size
|
|
if stride_y is None:
|
|
stride_y = stride
|
|
if padding_y is None:
|
|
padding_y = padding
|
|
|
|
if num_channels is None:
|
|
num_channels = img.num_filters
|
|
|
|
assert isinstance(filter, LayerOutput)
|
|
assert filter.size is not None
|
|
|
|
opCls = ConvTransOperator if trans else ConvOperator
|
|
|
|
op = opCls(
|
|
input_layer_names=[img.name, filter.name],
|
|
num_filters=num_filters,
|
|
conv_conf=Conv(
|
|
filter_size=filter_size,
|
|
padding=padding,
|
|
stride=stride,
|
|
channels=num_channels,
|
|
filter_size_y=filter_size_y,
|
|
padding_y=padding_y,
|
|
stride_y=stride_y,
|
|
groups=1))
|
|
|
|
op.origin = [img, filter]
|
|
return op
|
|
|
|
|
|
@wrap_param_attr_default()
|
|
def conv_projection(input,
|
|
filter_size,
|
|
num_filters,
|
|
num_channels=None,
|
|
stride=1,
|
|
padding=0,
|
|
filter_size_y=None,
|
|
stride_y=None,
|
|
padding_y=None,
|
|
groups=1,
|
|
param_attr=None,
|
|
trans=False):
|
|
"""
|
|
Different from img_conv_layer and conv_op, conv_projection is a Projection,
|
|
which can be used in mixed_layer and concat_layer. It uses cudnn to implement
|
|
convolution and only supports GPU mode.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
proj = conv_projection(input=input1,
|
|
filter_size=3,
|
|
num_filters=64,
|
|
num_channels=64)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param filter_size: The dimensions of the filter kernel. If the parameter is
|
|
set to one integer, the two dimensions on x and y axises
|
|
will be same when filter_size_y is not set. If it is set
|
|
to a list, the first element indicates the dimension on
|
|
the x axis, and the second is used to specify the dimension
|
|
on the y axis when filter_size_y is not provided.
|
|
:type filter_size: int | tuple | list
|
|
:param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
|
|
is not set, it will be set automatically according to filter_size.
|
|
:type filter_size_y: int
|
|
:param num_filters: The number of filters.
|
|
:type num_filters: int
|
|
:param num_channels: The number of the input channels.
|
|
:type num_channels: int
|
|
:param stride: The strides. If the parameter is set to one integer, the strides
|
|
on x and y axises will be same when stride_y is not set. If it is
|
|
set to a list, the first element indicates the stride on the x axis,
|
|
and the second is used to specify the stride on the y axis when
|
|
stride_y is not provided.
|
|
:type stride: int | tuple | list
|
|
:param stride_y: The stride on the y axis.
|
|
:type stride_y: int
|
|
:param padding: The padding sizes. If the parameter is set to one integer, the padding
|
|
sizes on x and y axises will be same when padding_y is not set. If it
|
|
is set to a list, the first element indicates the padding size on the
|
|
x axis, and the second is used to specify the padding size on the y axis
|
|
when padding_y is not provided.
|
|
:type padding: int | tuple | list
|
|
:param padding_y: The padding size on the y axis.
|
|
:type padding_y: int
|
|
:param groups: The group number.
|
|
:type groups: int
|
|
:param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
|
|
details.
|
|
:type param_attr: ParameterAttribute
|
|
:param trans: Whether it is ConvTransProjection or ConvProjection
|
|
:type trans: bool
|
|
:return: A Projection Object.
|
|
:rtype: ConvTransProjection | ConvProjection
|
|
"""
|
|
if num_channels is None:
|
|
assert input.num_filters is not None
|
|
num_channels = input.num_filters
|
|
|
|
if filter_size_y is None:
|
|
if isinstance(filter_size, collections.Sequence):
|
|
assert len(filter_size) == 2
|
|
filter_size, filter_size_y = filter_size
|
|
else:
|
|
filter_size_y = filter_size
|
|
|
|
if stride_y is None:
|
|
if isinstance(stride, collections.Sequence):
|
|
assert len(stride) == 2
|
|
stride, stride_y = stride
|
|
else:
|
|
stride_y = stride
|
|
|
|
if padding_y is None:
|
|
if isinstance(padding, collections.Sequence):
|
|
assert len(padding) == 2
|
|
padding, padding_y = padding
|
|
else:
|
|
padding_y = padding
|
|
|
|
if param_attr.attr.get('initial_smart'):
|
|
# special initial for conv layers.
|
|
init_w = (2.0 / (filter_size**2 * num_channels))**0.5
|
|
param_attr.attr["initial_mean"] = 0.0
|
|
param_attr.attr["initial_std"] = init_w
|
|
param_attr.attr["initial_strategy"] = 0
|
|
param_attr.attr["initial_smart"] = False
|
|
|
|
projCls = ConvTransProjection if trans else ConvProjection
|
|
|
|
proj = projCls(
|
|
input_layer_name=input.name,
|
|
num_filters=num_filters,
|
|
conv_conf=Conv(
|
|
filter_size=filter_size,
|
|
padding=padding,
|
|
stride=stride,
|
|
channels=num_channels,
|
|
filter_size_y=filter_size_y,
|
|
padding_y=padding_y,
|
|
stride_y=stride_y,
|
|
groups=groups),
|
|
**param_attr.attr)
|
|
|
|
proj.origin = input
|
|
return proj
|
|
|
|
|
|
@wrap_name_default("pad")
|
|
@layer_support()
|
|
def pad_layer(input,
|
|
pad_c=None,
|
|
pad_h=None,
|
|
pad_w=None,
|
|
name=None,
|
|
layer_attr=None):
|
|
"""
|
|
This operation pads zeros to the input data according to pad_c, pad_h
and pad_w. pad_c, pad_h and pad_w specify the size in the corresponding
dimension. The input data shape is NCHW.
|
|
|
|
For example, pad_c=[2,3] means padding 2 zeros before the input data
|
|
and 3 zeros after the input data in the channel dimension. pad_h means
|
|
padding zeros in the height dimension. pad_w means padding zeros in the
|
|
width dimension.
|
|
|
|
For example,
|
|
|
|
.. code-block:: python
|
|
|
|
input(2,2,2,3) = [
|
|
[ [[1,2,3], [3,4,5]],
|
|
[[2,3,5], [1,6,7]] ],
|
|
[ [[4,3,1], [1,8,7]],
|
|
[[3,8,9], [2,3,5]] ]
|
|
]
|
|
|
|
pad_c=[1,1], pad_h=[0,0], pad_w=[0,0]
|
|
|
|
output(2,4,2,3) = [
|
|
[ [[0,0,0], [0,0,0]],
|
|
[[1,2,3], [3,4,5]],
|
|
[[2,3,5], [1,6,7]],
|
|
[[0,0,0], [0,0,0]] ],
|
|
[ [[0,0,0], [0,0,0]],
|
|
[[4,3,1], [1,8,7]],
|
|
[[3,8,9], [2,3,5]],
|
|
[[0,0,0], [0,0,0]] ]
|
|
]
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
pad = pad_layer(input=ipt,
|
|
pad_c=[4,4],
|
|
pad_h=[0,0],
|
|
pad_w=[2,2])
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param pad_c: The padding size in the channel dimension.
|
|
:type pad_c: list | None
|
|
:param pad_h: The padding size in the height dimension.
|
|
:type pad_h: list | None
|
|
:param pad_w: The padding size in the width dimension.
|
|
:type pad_w: list | None
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if pad_c is not None:
|
|
assert isinstance(pad_c, collections.Sequence) and len(pad_c) == 2
|
|
else:
|
|
pad_c = [0, 0]
|
|
|
|
if pad_h is not None:
|
|
assert isinstance(pad_h, collections.Sequence) and len(pad_h) == 2
|
|
else:
|
|
pad_h = [0, 0]
|
|
|
|
if pad_w is not None:
|
|
assert isinstance(pad_w, collections.Sequence) and len(pad_w) == 2
|
|
else:
|
|
pad_w = [0, 0]
|
|
|
|
assert input.num_filters is not None
|
|
in_ch = input.num_filters
|
|
out_ch = in_ch + pad_c[0] + pad_c[1]
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.PAD_LAYER,
|
|
inputs=Input(
|
|
input.name,
|
|
pad=Pad(
|
|
channels=in_ch,
|
|
pad_c=pad_c,
|
|
pad_h=pad_h,
|
|
pad_w=pad_w, )),
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
layer_type=LayerType.PAD_LAYER,
|
|
parents=[input],
|
|
num_filters=out_ch,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def conv_shift_layer(a, b, name=None, layer_attr=None):
|
|
"""
|
|
This layer performs cyclic convolution on two inputs. For example:
|
|
- a[in]: contains M elements.
|
|
- b[in]: contains N elements (N should be odd).
|
|
- c[out]: contains M elements.
|
|
|
|
.. math::
|
|
|
|
c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j}
|
|
|
|
In this formula:
|
|
- a's index is computed modulo M. When it is negative, then get item from
|
|
the right side (which is the end of array) to the left.
|
|
- b's index is computed modulo N. When it is negative, then get item from
|
|
the right side (which is the end of the array) to the left.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
conv_shift = conv_shift_layer(a=layer1, b=layer2)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param a: The first input of this layer.
|
|
:type a: LayerOutput
|
|
:param b: The second input of this layer.
|
|
:type b: LayerOutput
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
|
|
assert b.size is None or b.size % 2 == 1 # size of b must be odd.
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.CONV_SHIFT_LAYER,
|
|
inputs=[a.name, b.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name, LayerType.CONV_SHIFT_LAYER, parents=[a, b], size=a.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@wrap_param_attr_default()
|
|
@wrap_bias_attr_default()
|
|
@wrap_act_default(act=LinearActivation())
|
|
@layer_support(ERROR_CLIPPING, DROPOUT)
|
|
def tensor_layer(a,
|
|
b,
|
|
size,
|
|
act=None,
|
|
name=None,
|
|
param_attr=None,
|
|
bias_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
This layer performs tensor operation on two inputs.
|
|
For example:
|
|
|
|
.. math::
|
|
y_{i} = a * W_{i} * {b^\mathrm{T}}, i=0,1,...,K-1
|
|
|
|
In this formula:
- :math:`a`: the first input contains M elements.
- :math:`b`: the second input contains N elements.
- :math:`y_{i}`: the i-th element of y.
- :math:`W_{i}`: the i-th learned weight, whose shape is [M, N].
- :math:`b^\mathrm{T}`: the transpose of :math:`b`.
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
tensor = tensor_layer(a=layer1, b=layer2, size=1000)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param a: The first input of this layer.
|
|
:type a: LayerOutput
|
|
:param b: The second input of this layer.
|
|
:type b: LayerOutput
|
|
:param size: The dimension of this layer.
|
|
:type size: int
|
|
:param act: Activation type. LinearActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for
|
|
details.
|
|
:type param_attr: ParameterAttribute
|
|
:param bias_attr: The parameter attribute for bias. If this parameter is set to
|
|
False or an object whose type is not ParameterAttribute,
|
|
no bias is defined. If this parameter is set to True,
|
|
the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
|
|
Layer(
|
|
name=name,
|
|
size=size,
|
|
type=LayerType.TENSOR_LAYER,
|
|
active_type=act.name,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
inputs=[Input(a.name, **param_attr.attr), Input(b.name)],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.TENSOR_LAYER, parents=[a, b], activation=act, size=size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@wrap_param_attr_default()
|
|
@wrap_bias_attr_default()
|
|
@wrap_act_default()
|
|
@layer_support(DROPOUT, ERROR_CLIPPING)
|
|
def selective_fc_layer(input,
|
|
size,
|
|
select=None,
|
|
act=None,
|
|
name=None,
|
|
pass_generation=False,
|
|
has_selected_colums=True,
|
|
mul_ratio=0.02,
|
|
param_attr=None,
|
|
bias_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
Selective fully connected layer. Different from fc_layer, the output
of this layer can be sparse. It requires an additional input to indicate
several selected columns for output. If the selected columns are not
specified, selective_fc_layer acts exactly like fc_layer.
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
sel_fc = selective_fc_layer(input=input, size=128, act=TanhActivation())
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput | list | tuple
|
|
:param select: The layer to select columns to output. It should be a sparse
|
|
binary matrix, and is treated as the mask of selective fc. If
|
|
it is not set or set to None, selective_fc_layer acts exactly
|
|
like fc_layer.
|
|
:type select: LayerOutput
|
|
:param size: The dimension of this layer, which should be equal to that of
|
|
the layer 'select'.
|
|
:type size: int
|
|
:param act: Activation type. TanhActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param pass_generation: The flag which indicates whether it is during generation.
|
|
:type pass_generation: bool
|
|
:param has_selected_colums: The flag which indicates whether the parameter 'select'
|
|
has been set. True is the default.
|
|
:type has_selected_colums: bool
|
|
:param mul_ratio: A ratio helps to judge how sparse the output is and determine
|
|
the computation method for speed consideration.
|
|
:type mul_ratio: float
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for
|
|
details.
|
|
:type param_attr: ParameterAttribute
|
|
:param bias_attr: The parameter attribute for bias. If this parameter is set to
|
|
False or an object whose type is not ParameterAttribute,
|
|
no bias is defined. If this parameter is set to True,
|
|
the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if isinstance(input, LayerOutput):
|
|
input = [input]
|
|
assert not isinstance(param_attr, collections.Sequence)
|
|
param_attr = [param_attr]
|
|
else:
|
|
if isinstance(param_attr, collections.Sequence):
|
|
assert len(input) == len(param_attr)
|
|
else:
|
|
if "parameter_name" in param_attr.attr and len(input) > 1:
|
|
logger.fatal(
|
|
"When the name field of param_attr is manually specified "
|
|
"and the input is a list, the param_attr should also be a "
|
|
"list with each item being the param_attr for each input "
|
|
"item. If only one named param_attr is provided, all the "
|
|
"input items would share this parameter.")
|
|
param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
|
|
|
|
assert isinstance(input, collections.Sequence)
|
|
assert isinstance(select, LayerOutput)
|
|
if select.size is not None:
|
|
assert select.size == size
|
|
Layer(
|
|
inputs=[
|
|
Input(ipt.name, **attr.attr) for ipt, attr in zip(input, param_attr)
|
|
] + [select.name],
|
|
name=name,
|
|
type=LayerType.SEL_FC_LAYER,
|
|
size=size,
|
|
bias=ParameterAttribute.to_bias(bias_attr),
|
|
active_type=act.name,
|
|
selective_fc_pass_generation=pass_generation,
|
|
has_selected_colums=has_selected_colums,
|
|
selective_fc_full_mul_ratio=mul_ratio,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.SEL_FC_LAYER,
|
|
list(input) + [select],
|
|
activation=act,
|
|
size=size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def sampling_id_layer(input, name=None, layer_attr=None):
|
|
"""
|
|
A layer for sampling an id from the multinomial distribution given by the input layer.
One id is sampled for each sample.
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
sampling_id = sampling_id_layer(input=input)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.SAMPLING_ID_LAYER,
|
|
inputs=[Input(input.name)],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.SAMPLING_ID_LAYER, input, size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def slope_intercept_layer(input,
|
|
name=None,
|
|
slope=1.0,
|
|
intercept=0.0,
|
|
layer_attr=None):
|
|
"""
|
|
This layer applies a slope and an intercept to the input.
|
|
|
|
.. math::
|
|
y = slope * x + intercept
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
scale = slope_intercept_layer(input=input, slope=-1.0, intercept=1.0)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param slope: The scale factor.
|
|
:type slope: float
|
|
:param intercept: The offset.
|
|
:type intercept: float
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.SLOPE_INTERCEPT_LAYER,
|
|
slope=slope,
|
|
intercept=intercept,
|
|
inputs=[Input(input.name)],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.SLOPE_INTERCEPT_LAYER, input, size=input.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def linear_comb_layer(weights, vectors, size=None, name=None, layer_attr=None):
|
|
"""
|
|
A layer for the weighted sum of vectors. It takes two inputs.
|
|
- Input: size of weights is M
|
|
size of vectors is M*N
|
|
- Output: a vector of size=N
|
|
|
|
.. math::
|
|
|
|
z(i) = \sum_{j=0}^{M-1} x(j) y(i+Nj)
|
|
|
|
where :math:`0 \le i \le N-1`
|
|
|
|
Or in the matrix notation:
|
|
|
|
.. math::
|
|
|
|
z = x^\mathrm{T} Y
|
|
|
|
In this formula:
|
|
- :math:`x`: weights
|
|
- :math:`y`: vectors.
|
|
- :math:`z`: the output.
|
|
|
|
Note that the above computation is for one sample. Multiple samples are
|
|
processed in one batch.
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
linear_comb = linear_comb_layer(weights=weight, vectors=vectors,
|
|
size=elem_dim)
|
|
|
|
:param weights: The weight layer.
|
|
:type weights: LayerOutput
|
|
:param vectors: The vector layer.
|
|
:type vectors: LayerOutput
|
|
:param size: The dimension of this layer.
|
|
:type size: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(weights, LayerOutput) and isinstance(vectors, LayerOutput)
|
|
if vectors.size is not None and weights.size is not None:
|
|
assert vectors.size % weights.size == 0
|
|
if size is None:
|
|
size = vectors.size / weights.size
|
|
else:
|
|
assert size == vectors.size / weights.size
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.LINEAR_COMBINATION_LAYER,
|
|
size=size,
|
|
inputs=[Input(weights.name), Input(vectors.name)],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.LINEAR_COMBINATION_LAYER, [weights, vectors], size=size)
|
|
|
|
|
|
convex_comb_layer = linear_comb_layer
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def block_expand_layer(input,
|
|
block_x=0,
|
|
block_y=0,
|
|
stride_x=0,
|
|
stride_y=0,
|
|
padding_x=0,
|
|
padding_y=0,
|
|
num_channels=None,
|
|
name=None,
|
|
layer_attr=None):
|
|
"""
|
|
Expand feature map to minibatch matrix.
|
|
- matrix width is: block_y * block_x * num_channels
|
|
    - matrix height is: outputH * outputW
|
|
|
|
.. math::
|
|
|
|
outputH = 1 + (2 * padding_y + imgSizeH - block_y + stride_y - 1) / stride_y
|
|
|
|
outputW = 1 + (2 * padding_x + imgSizeW - block_x + stride_x - 1) / stride_x
|
|
|
|
    The expanding method is the same as that of ExpandConvLayer, but the
    transposed value is saved. After expanding, output.sequenceStartPositions
    will store the timeline.
|
|
The number of time steps is outputH * outputW and the dimension of each
|
|
time step is block_y * block_x * num_channels. This layer can be used after
|
|
convolutional neural network, and before recurrent neural network.
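
    As a rough sketch (purely illustrative; all values below are example
    assumptions), the output geometry follows the formulas above:

    .. code-block:: python

        img_size_h, img_size_w = 32, 32
        block_y, block_x = 3, 3
        stride_y, stride_x = 1, 1
        padding_y, padding_x = 0, 0
        num_channels = 128

        output_h = 1 + (2 * padding_y + img_size_h - block_y + stride_y - 1) // stride_y
        output_w = 1 + (2 * padding_x + img_size_w - block_x + stride_x - 1) // stride_x

        num_time_steps = output_h * output_w           # sequence length
        step_dim = block_y * block_x * num_channels    # dimension of each time step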
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
block_expand = block_expand_layer(input=layer,
|
|
num_channels=128,
|
|
stride_x=1,
|
|
stride_y=1,
|
|
block_x=1,
|
|
                                          block_y=3)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param num_channels: The number of input channels. If the parameter is not set or
|
|
set to None, its actual value will be automatically set to
|
|
the channels number of the input.
|
|
:type num_channels: int
|
|
    :param block_x: The width of the sub-block.
    :type block_x: int
    :param block_y: The height of the sub-block.
|
|
:type block_y: int
|
|
:param stride_x: The stride size in horizontal direction.
|
|
:type stride_x: int
|
|
:param stride_y: The stride size in vertical direction.
|
|
:type stride_y: int
|
|
:param padding_x: The padding size in horizontal direction.
|
|
:type padding_x: int
|
|
:param padding_y: The padding size in vertical direction.
|
|
:type padding_y: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring.
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if num_channels is None:
|
|
assert input.num_filters is not None
|
|
num_channels = input.num_filters
|
|
l = Layer(
|
|
name=name,
|
|
inputs=Input(
|
|
input.name,
|
|
block_expand=BlockExpand(
|
|
channels=num_channels,
|
|
block_x=block_x,
|
|
block_y=block_y,
|
|
stride_x=stride_x,
|
|
stride_y=stride_y,
|
|
padding_x=padding_x,
|
|
padding_y=padding_y)),
|
|
type=LayerType.BLOCK_EXPAND,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name, LayerType.BLOCK_EXPAND, parents=[input], size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
|
|
"""
|
|
A layer to do max out on convolutional layer output.
|
|
- Input: the output of a convolutional layer.
|
|
    - Output: the feature map size is the same as the input's, and its channel
      number is (input channel) / groups.

    So groups should be larger than 1, and the number of channels should be
    divisible by groups.
|
|
|
|
Reference:
|
|
`Maxout Networks
|
|
<http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf>`_
|
|
`Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
|
|
<https://arxiv.org/pdf/1312.6082v4.pdf>`_
|
|
|
|
|
|
.. math::
|
|
|
|
& out = \max_k (in[n, k, o_c , s])
|
|
|
|
& out_{i * s + j} = \max_k in_{ k * o_{c} * s + i * s + j}
|
|
|
|
& s = \\frac{input.size}{ num\_channels}
|
|
|
|
& o_{c} = \\frac{num\_channels}{groups}
|
|
|
|
& 0 \le i < o_{c}
|
|
|
|
& 0 \le j < s
|
|
|
|
& 0 \le k < groups
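
    Equivalently, as a rough NumPy sketch for one sample (purely illustrative;
    ``np`` is numpy and the sizes are example assumptions):

    .. code-block:: python

        import numpy as np

        num_channels, groups, s = 8, 4, 16      # s = spatial size per channel
        x = np.random.rand(num_channels * s)    # input of one sample
        o_c = num_channels // groups            # output channel number
        # max over the group axis k, matching out_{i*s+j} = max_k in_{k*o_c*s + i*s + j}
        out = x.reshape(groups, o_c, s).max(axis=0).reshape(-1)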
|
|
|
|
|
|
The simple usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
maxout = maxout_layer(input,
|
|
num_channels=128,
|
|
groups=4)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param num_channels: The number of input channels. If the parameter is not set or
|
|
set to None, its actual value will be automatically set to
|
|
the channels number of the input.
|
|
:type num_channels: int
|
|
:param groups: The group number of input layer.
|
|
:type groups: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input.activation, LinearActivation)
|
|
assert groups > 1
|
|
if num_channels is None:
|
|
assert input.num_filters is not None
|
|
num_channels = input.num_filters
|
|
assert num_channels % groups == 0
|
|
l = Layer(
|
|
name=name,
|
|
inputs=Input(
|
|
input.name, maxout=MaxOut(
|
|
channels=num_channels, groups=groups)),
|
|
type=LayerType.MAXOUT,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.MAXOUT, parents=[input], size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def ctc_layer(input,
|
|
label,
|
|
size=None,
|
|
name=None,
|
|
norm_by_times=False,
|
|
layer_attr=None):
|
|
"""
|
|
Connectionist Temporal Classification (CTC) is designed for temporal
|
|
    classification tasks, e.g. sequence labeling problems where the
|
|
alignment between the inputs and the target labels is unknown.
|
|
|
|
Reference:
|
|
`Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
|
|
with Recurrent Neural Networks
|
|
<http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
|
|
|
|
Note:
|
|
Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
|
|
as the size of the input, where num_classes is the category number.
|
|
And the 'blank' is the last category index. So the size of 'input' layer (e.g.
|
|
fc_layer with softmax activation) should be (num_classes + 1). The size of
|
|
ctc_layer should also be (num_classes + 1).
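
    For instance, a minimal sketch of wiring the sizes (illustrative only; the
    layer names and the num_classes value are assumptions):

    .. code-block:: python

        num_classes = 9054
        # the input of ctc_layer, e.g. an fc_layer with softmax activation,
        # must have size num_classes + 1, the last index being the 'blank' label
        fc = fc_layer(input=net_output, size=num_classes + 1,
                      act=SoftmaxActivation())
        ctc = ctc_layer(input=fc, label=label, size=num_classes + 1)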
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
ctc = ctc_layer(input=input,
|
|
label=label,
|
|
size=9055,
|
|
norm_by_times=True)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param label: The input label.
|
|
:type label: LayerOutput
|
|
:param size: The dimension of this layer, which must be equal to (category number + 1).
|
|
:type size: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param norm_by_times: Whether to do normalization by times. False is the default.
|
|
:type norm_by_times: bool
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
assert isinstance(label, LayerOutput)
|
|
if label.size is not None:
|
|
if size is not None:
|
|
assert size == label.size + 1
|
|
else:
|
|
size = label.size + 1
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.CTC_LAYER,
|
|
size=size,
|
|
norm_by_times=norm_by_times,
|
|
inputs=[input.name, label.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def warp_ctc_layer(input,
|
|
label,
|
|
size=None,
|
|
name=None,
|
|
blank=0,
|
|
norm_by_times=False,
|
|
layer_attr=None):
|
|
"""
|
|
    A layer integrating the open-source `warp-ctc
|
|
<https://github.com/baidu-research/warp-ctc>`_ library, which is used in
|
|
    `Deep Speech 2: End-to-End Speech Recognition in English and Mandarin
|
|
<https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
|
|
Classification (CTC) loss. Besides, another `warp-ctc
|
|
<https://github.com/gangliao/warp-ctc>`_ repository, which is forked from
|
|
the official one, is maintained to enable more compiling options. During the
|
|
    building process, PaddlePaddle will clone the source code, build it, and
    install it to the :code:`third_party/install/warpctc` directory.
|
|
|
|
Reference:
|
|
`Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
|
|
with Recurrent Neural Networks
|
|
<http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
|
|
|
|
Note:
|
|
- Let num_classes represents the category number. Considering the 'blank'
|
|
label needed by CTC, you need to use (num_classes + 1) as the size of
|
|
warp_ctc layer.
|
|
    - You can set 'blank' to any value in the range [0, num_classes], which
|
|
should be consistent with those used in your labels.
|
|
    - As a native 'softmax' activation is integrated into the warp-ctc library,
|
|
'linear' activation is expected to be used instead in the 'input' layer.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
ctc = warp_ctc_layer(input=input,
|
|
label=label,
|
|
size=1001,
|
|
blank=1000,
|
|
norm_by_times=False)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param label: The input label.
|
|
:type label: LayerOutput
|
|
:param size: The dimension of this layer, which must be equal to (category number + 1).
|
|
:type size: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param blank: The 'blank' label used in ctc.
|
|
:type blank: int
|
|
:param norm_by_times: Whether to do normalization by times. False is the default.
|
|
:type norm_by_times: bool
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
assert isinstance(label, LayerOutput)
|
|
if label.size is not None:
|
|
if size is not None:
|
|
assert size == label.size + 1
|
|
else:
|
|
size = label.size + 1
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.WARP_CTC_LAYER,
|
|
size=size,
|
|
blank=blank,
|
|
norm_by_times=norm_by_times,
|
|
inputs=[input.name, label.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.WARP_CTC_LAYER, parents=[input, label], size=size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@wrap_param_attr_default()
|
|
@layer_support()
|
|
def crf_layer(input,
|
|
label,
|
|
size=None,
|
|
weight=None,
|
|
param_attr=None,
|
|
name=None,
|
|
coeff=1.0,
|
|
layer_attr=None):
|
|
"""
|
|
    A layer for calculating the cost of a sequential conditional random
|
|
field model.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
crf = crf_layer(input=input,
|
|
label=label,
|
|
size=label_dim)
|
|
|
|
:param input: The first input layer.
|
|
:type input: LayerOutput
|
|
:param label: The input label.
|
|
:type label: LayerOutput
|
|
:param size: The category number.
|
|
:type size: int
|
|
:param weight: The weight layer defines a weight for each sample in the
|
|
mini-batch. It is optional.
|
|
:type weight: LayerOutput
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for
|
|
details.
|
|
:type param_attr: ParameterAttribute
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param coeff: The weight of the gradient in the back propagation.
|
|
1.0 is the default value.
|
|
:type coeff: float
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
assert isinstance(label, LayerOutput)
|
|
assert weight is None or isinstance(weight, LayerOutput)
|
|
if input.size is not None and label.size is not None:
|
|
assert input.size == label.size
|
|
if size is None:
|
|
size = input.size
|
|
else:
|
|
assert size == input.size
|
|
|
|
ipts = [Input(input.name, **param_attr.attr), Input(label.name)]
|
|
if weight is not None:
|
|
ipts.append(Input(weight.name))
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.CRF_LAYER,
|
|
size=size,
|
|
inputs=ipts,
|
|
coeff=coeff,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
parents = [input, label]
|
|
if weight is not None:
|
|
parents.append(weight)
|
|
# The size for LayerOutput means the dimension of the output.
|
|
# It's different from the meaning of crf layer, which is the number of
|
|
# classes.
|
|
return LayerOutput(name, LayerType.CRF_LAYER, parents, size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
@wrap_param_attr_default()
|
|
@layer_support()
|
|
def crf_decoding_layer(input,
|
|
size,
|
|
label=None,
|
|
param_attr=None,
|
|
name=None,
|
|
layer_attr=None):
|
|
"""
|
|
    A layer for calculating the decoding sequence of a sequential conditional
|
|
random field model. The decoding sequence is stored in output.ids.
|
|
If the input 'label' is provided, it is treated as the ground-truth label, and
|
|
this layer will also calculate error. output.value[i] is 1 for an incorrect
|
|
    decoding and 0 for a correct one.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
crf_decoding = crf_decoding_layer(input=input,
|
|
size=label_dim)
|
|
|
|
:param input: The first input layer.
|
|
:type input: LayerOutput
|
|
:param size: The dimension of this layer.
|
|
:type size: int
|
|
:param label: The input label.
|
|
:type label: LayerOutput | None
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for
|
|
details.
|
|
:type param_attr: ParameterAttribute
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
assert isinstance(input, LayerOutput)
|
|
assert label is None or isinstance(label, LayerOutput)
|
|
|
|
ipts = [Input(input.name, **param_attr.attr)]
|
|
if label is not None:
|
|
ipts.append(Input(label.name))
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.CRF_DECODING_LAYER,
|
|
size=size,
|
|
inputs=ipts,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
parents = [input]
|
|
if label is not None:
|
|
parents.append(label)
|
|
# The size for LayerOutput means the dimension of the output.
|
|
# It's different from the meaning of crf layer, which is the number of
|
|
# classes.
|
|
return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)
|
|
|
|
|
|
"""
|
|
Following are cost Layers.
|
|
"""
|
|
|
|
|
|
@wrap_bias_attr_default(has_bias=True)
|
|
@wrap_param_attr_default()
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def nce_layer(input,
|
|
label,
|
|
num_classes=None,
|
|
param_attr=None,
|
|
weight=None,
|
|
num_neg_samples=10,
|
|
neg_distribution=None,
|
|
name=None,
|
|
bias_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
Noise-contrastive estimation.
|
|
|
|
Reference:
|
|
`A fast and simple algorithm for training neural probabilistic language
|
|
models. <https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf>`_
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = nce_layer(input=[layer1, layer2], label=layer2,
|
|
param_attr=[attr1, attr2], weight=layer3,
|
|
num_classes=3, neg_distribution=[0.1,0.3,0.6])
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The first input of this layer.
|
|
:type input: LayerOutput | list | tuple | collections.Sequence
|
|
:param label: The input label.
|
|
:type label: LayerOutput
|
|
:param weight: The weight layer defines a weight for each sample in the
|
|
mini-batch. It is optional.
|
|
:type weight: LayerOutput
|
|
:param num_classes: The number of classes.
|
|
:type num_classes: int
|
|
:param act: Activation type. SigmoidActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for
|
|
details.
|
|
:type param_attr: ParameterAttribute
|
|
:param num_neg_samples: The number of sampled negative labels. 10 is the
|
|
default value.
|
|
:type num_neg_samples: int
|
|
:param neg_distribution: The discrete noisy distribution over the output
|
|
space from which num_neg_samples negative labels
|
|
are sampled. If this parameter is not set, a
|
|
uniform distribution will be used. A user-defined
|
|
distribution is a list whose length must be equal
|
|
to the num_classes. Each member of the list defines
|
|
the probability of a class given input x.
|
|
:type neg_distribution: list | tuple | collections.Sequence | None
|
|
:param bias_attr: The parameter attribute for bias. If this parameter is set to
|
|
False or an object whose type is not ParameterAttribute,
|
|
no bias is defined. If this parameter is set to True,
|
|
the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if isinstance(input, LayerOutput):
|
|
input = [input]
|
|
assert not isinstance(param_attr, collections.Sequence)
|
|
param_attr = [param_attr]
|
|
else:
|
|
if isinstance(param_attr, collections.Sequence):
|
|
assert len(input) == len(param_attr)
|
|
else:
|
|
param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
|
|
|
|
assert isinstance(input, collections.Sequence)
|
|
|
|
assert isinstance(label, LayerOutput)
|
|
assert label.layer_type == LayerType.DATA
|
|
if num_classes is None:
|
|
num_classes = label.size
|
|
if neg_distribution is not None:
|
|
assert isinstance(neg_distribution, collections.Sequence)
|
|
assert len(neg_distribution) == num_classes
|
|
assert abs(sum(neg_distribution) - 1.0) < 1e-5
|
|
|
|
ipts_for_layer = []
|
|
parents = []
|
|
for each_input, attr in zip(input, param_attr):
|
|
assert isinstance(each_input, LayerOutput)
|
|
ipts_for_layer.append(Input(each_input.name, **attr.attr))
|
|
parents.append(each_input)
|
|
ipts_for_layer.append(label.name)
|
|
parents.append(label)
|
|
|
|
if weight is not None:
|
|
assert isinstance(weight, LayerOutput)
|
|
assert weight.layer_type == LayerType.DATA
|
|
ipts_for_layer.append(weight.name)
|
|
parents.append(weight)
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.NCE_LAYER,
|
|
num_classes=num_classes,
|
|
neg_sampling_dist=neg_distribution,
|
|
active_type=SigmoidActivation().name,
|
|
num_neg_samples=num_neg_samples,
|
|
inputs=ipts_for_layer,
|
|
bias=ParamAttr.to_bias(bias_attr),
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.NCE_LAYER,
|
|
parents=parents,
|
|
size=l.config.size,
|
|
activation=SigmoidActivation())
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def rank_cost(left,
|
|
right,
|
|
label,
|
|
weight=None,
|
|
name=None,
|
|
coeff=1.0,
|
|
layer_attr=None):
|
|
"""
|
|
A cost Layer for learning to rank using gradient descent.
|
|
|
|
Reference:
|
|
`Learning to Rank using Gradient Descent
|
|
<http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_
|
|
|
|
.. math::
|
|
|
|
        C_{i,j} & = -\\tilde{P_{i,j}} * o_{i,j} + \\log(1 + e^{o_{i,j}})
|
|
|
|
o_{i,j} & = o_i - o_j
|
|
|
|
\\tilde{P_{i,j}} & = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\}
|
|
|
|
In this formula:
|
|
- :math:`C_{i,j}` is the cross entropy cost.
|
|
- :math:`\\tilde{P_{i,j}}` is the label. 1 means positive order
|
|
and 0 means reverse order.
|
|
- :math:`o_i` and :math:`o_j`: the left output and right output.
|
|
Their dimension is one.
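
    Putting the pieces above together, a rough NumPy sketch of the cost for one
    pair (purely illustrative; ``np`` is numpy and the concrete values are
    assumptions):

    .. code-block:: python

        import numpy as np

        o_i, o_j = 0.8, 0.3      # left and right outputs (each of dimension one)
        P = 1.0                  # label: 1 means the left item should rank first
        o = o_i - o_j
        C = -P * o + np.log(1.0 + np.exp(o))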
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = rank_cost(left=out_left,
|
|
right=out_right,
|
|
label=label)
|
|
|
|
    :param left: The left input, whose size must be 1.
    :type left: LayerOutput
    :param right: The right input, whose size must be 1.
|
|
:type right: LayerOutput
|
|
    :param label: The label, which is either 1 or 0, indicating positive order
                  or reverse order respectively.
|
|
:type label: LayerOutput
|
|
:param weight: The weight layer defines a weight for each sample in the
|
|
mini-batch. It is optional.
|
|
:type weight: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param coeff: The weight of the gradient in the back propagation.
|
|
1.0 is the default value.
|
|
:type coeff: float
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert left.size == 1
|
|
assert right.size == 1
|
|
assert label.size == 1
|
|
|
|
ipts = [left.name, right.name, label.name]
|
|
parents = [left, right, label]
|
|
if weight is not None:
|
|
ipts.append(weight.name)
|
|
parents.append(weight)
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.RANK_COST,
|
|
inputs=ipts,
|
|
coeff=coeff,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(name, LayerType.RANK_COST, parents=parents, size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def lambda_cost(input,
|
|
score,
|
|
name,
|
|
NDCG_num=5,
|
|
max_sort_size=-1,
|
|
layer_attr=None):
|
|
"""
|
|
lambdaCost for lambdaRank LTR approach.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = lambda_cost(input=input,
|
|
score=score,
|
|
NDCG_num=8,
|
|
max_sort_size=-1)
|
|
|
|
    :param input: The first input of this layer, which is often a list of
                  document samples of the same query; its type must be a sequence.
    :type input: LayerOutput
    :param score: The scores of the samples.
    :type score: LayerOutput
|
|
:param NDCG_num: The size of NDCG (Normalized Discounted Cumulative Gain),
|
|
e.g., 5 for NDCG@5. It must be less than or equal to the
|
|
minimum size of the list.
|
|
:type NDCG_num: int
|
|
:param max_sort_size: The size of partial sorting in calculating gradient. If
|
|
max_sort_size is equal to -1 or greater than the number
|
|
of the samples in the list, then the algorithm will sort
|
|
the entire list to compute the gradient. In other cases,
|
|
max_sort_size must be greater than or equal to NDCG_num.
|
|
:type max_sort_size: int
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, LayerOutput) and isinstance(score, LayerOutput)
|
|
if score.size is not None:
|
|
assert score.size == 1
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.LAMBDA_COST,
|
|
inputs=[input.name, score.name],
|
|
NDCG_num=NDCG_num,
|
|
max_sort_size=max_sort_size,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name, LayerType.LAMBDA_COST, parents=[input, score], size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def cross_entropy(input,
|
|
label,
|
|
name=None,
|
|
coeff=1.0,
|
|
weight=None,
|
|
layer_attr=None):
|
|
"""
|
|
    A loss layer for multi-class cross entropy.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = cross_entropy(input=input_layer,
|
|
label=label_layer)
|
|
|
|
:param input: The first input layer.
|
|
    :type input: LayerOutput
    :param label: The input label.
    :type label: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param coeff: The weight of the gradient in the back propagation.
|
|
1.0 is the default value.
|
|
:type coeff: float
|
|
:param weight: The weight layer defines a weight for each sample in the
|
|
mini-batch. It is optional.
|
|
    :type weight: LayerOutput
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
ipts, parents = __cost_input__(input, label, weight)
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.CROSS_ENTROPY,
|
|
inputs=ipts,
|
|
coeff=coeff,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def cross_entropy_with_selfnorm(input,
|
|
label,
|
|
name=None,
|
|
coeff=1.0,
|
|
softmax_selfnorm_alpha=0.1,
|
|
layer_attr=None):
|
|
"""
|
|
    A loss layer for multi-class cross entropy with self-normalization.
|
|
Input should be a vector of positive numbers, without normalization.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = cross_entropy_with_selfnorm(input=input_layer,
|
|
label=label_layer)
|
|
|
|
:param input: The first input layer.
|
|
:type input: LayerOutput
|
|
:param label: The input label.
|
|
    :type label: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param coeff: The weight of the gradient in the back propagation.
|
|
1.0 is the default value.
|
|
:type coeff: float
|
|
    :param softmax_selfnorm_alpha: The scale factor that affects the cost.
|
|
:type softmax_selfnorm_alpha: float
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.CROSS_ENTROPY_WITH_SELFNORM,
|
|
inputs=[input.name, label.name],
|
|
coeff=coeff,
|
|
softmax_selfnorm_alpha=softmax_selfnorm_alpha,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.CROSS_ENTROPY_WITH_SELFNORM,
|
|
parents=[input, label],
|
|
size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def sum_cost(input, name=None, layer_attr=None):
|
|
"""
|
|
A loss layer which calculates the sum of the input as loss.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = sum_cost(input=input_layer)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput.
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.SUM_COST,
|
|
inputs=[input.name],
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
|
|
return LayerOutput(name, LayerType.SUM_COST, parents=[input], size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def huber_regression_cost(input,
|
|
label,
|
|
name=None,
|
|
delta=1.0,
|
|
coeff=1.0,
|
|
layer_attr=None):
|
|
"""
|
|
In statistics, the Huber loss is a loss function used in robust regression,
|
|
that is less sensitive to outliers in data than the squared error loss.
|
|
Given a prediction f(x), a label y and :math:`\delta`, the loss function
|
|
is defined as:
|
|
|
|
.. math::
|
|
|
|
loss = 0.5*(y-f(x))^{2}, | y-f(x) | < \delta
|
|
|
|
loss = \delta | y-f(x) | - 0.5 \delta ^2, otherwise
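
    A rough sketch of this loss in plain Python (purely illustrative; the
    sample values are assumptions):

    .. code-block:: python

        delta = 1.0
        y, fx = 2.0, 0.25                          # label and prediction
        err = abs(y - fx)
        if err < delta:
            loss = 0.5 * err ** 2                  # quadratic region
        else:
            loss = delta * err - 0.5 * delta ** 2  # linear region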
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = huber_regression_cost(input=input_layer, label=label_layer)
|
|
|
|
:param input: The first input layer.
|
|
:type input: LayerOutput
|
|
:param label: The input label.
|
|
    :type label: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
    :param delta: The threshold on the absolute difference between the observed
                  and predicted values, beyond which the loss becomes linear.
|
|
:type delta: float
|
|
:param coeff: The weight of the gradient in the back propagation.
|
|
1.0 is the default value.
|
|
:type coeff: float
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput.
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.HUBER_REGRESSION,
|
|
inputs=[input.name, label.name],
|
|
delta=delta,
|
|
coeff=coeff,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.HUBER_REGRESSION, parents=[input, label], size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def huber_classification_cost(input,
|
|
label,
|
|
name=None,
|
|
coeff=1.0,
|
|
layer_attr=None):
|
|
"""
|
|
For classification purposes, a variant of the Huber loss called modified Huber
|
|
is sometimes used. Given a prediction f(x) (a real-valued classifier score) and
|
|
a true binary class label :math:`y\in \{-1, 1 \}`, the modified Huber
|
|
loss is defined as:
|
|
|
|
    .. math::
|
|
|
|
loss = \max ( 0, 1-yf(x) )^2, yf(x) \geq -1
|
|
|
|
loss = -4yf(x), otherwise
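
    A rough sketch of this loss in plain Python (purely illustrative; the
    sample values are assumptions):

    .. code-block:: python

        y = 1.0            # true label in {-1, 1}
        fx = -0.4          # real-valued classifier score
        if y * fx >= -1.0:
            loss = max(0.0, 1.0 - y * fx) ** 2
        else:
            loss = -4.0 * y * fx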
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = huber_classification_cost(input=input_layer, label=label_layer)
|
|
|
|
:param input: The first input layer.
|
|
:type input: LayerOutput
|
|
:param label: The input label.
|
|
    :type label: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param coeff: The weight of the gradient in the back propagation.
|
|
1.0 is the default value.
|
|
:type coeff: float
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
if input.size is not None:
|
|
assert input.size == 1
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.HUBER_CLASSIFICATION,
|
|
inputs=[input.name, label.name],
|
|
coeff=coeff,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.HUBER_CLASSIFICATION, parents=[input, label], size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def multi_binary_label_cross_entropy(input,
|
|
label,
|
|
name=None,
|
|
coeff=1.0,
|
|
layer_attr=None):
|
|
"""
|
|
A loss layer for multi binary label cross entropy.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = multi_binary_label_cross_entropy(input=input_layer,
|
|
label=label_layer)
|
|
|
|
:param input: The first input layer.
|
|
:type input: LayerOutput
|
|
:param label: The input label.
|
|
    :type label: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param coeff: The weight of the gradient in the back propagation.
|
|
1.0 is the default value.
|
|
:type coeff: float
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
if input.activation is None or \
|
|
not isinstance(input.activation, SigmoidActivation):
|
|
logger.log(logging.WARN,
|
|
("%s is not a recommended activation for "
|
|
"multi_binary_label_cross_entropy, sigmoid is better") %
|
|
repr(input.activation))
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
|
|
inputs=[input.name, label.name],
|
|
coeff=coeff,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name,
|
|
LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
|
|
parents=[input, label],
|
|
size=1)
|
|
|
|
|
|
class BeamInput(object):
|
|
"""
|
|
Define the input for cross_entropy_over_beam layer.
|
|
|
|
A beam is made up of a triple: the first one is scores over all
|
|
candidates; the second one is indices of top k selected candidates; the
|
|
third one is the index of ground truth, which is also always called
|
|
gold.
|
|
"""
|
|
|
|
def __init__(self, candidate_scores, selected_candidates, gold):
|
|
assert isinstance(candidate_scores, LayerOutput)
|
|
self.candidate_scores = candidate_scores
|
|
assert candidate_scores.size == 1
|
|
|
|
assert isinstance(selected_candidates, LayerOutput)
|
|
self.selected_candidates = selected_candidates
|
|
|
|
assert isinstance(gold, LayerOutput)
|
|
self.gold = gold
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def cross_entropy_over_beam(input, name=None):
|
|
"""
|
|
This layer is used in learning to search models, which is to solve complex
|
|
joint prediction problems based on learning to search through a
|
|
problem-defined search space.
|
|
|
|
Specifically, the learning to search process for this layer begins with
|
|
searching a target sequence from a nested sequence. In the first search
|
|
step, top beam size sequences with highest scores, indices of these top k
|
|
sequences in the original nested sequence, and the ground truth (also
|
|
called gold) altogether (a triple) make up of the first beam.
|
|
|
|
Then, several special positions, for example, start and end positions
|
|
that define meaningful segments are searched. In these searches, top k
|
|
positions with highest scores are selected, and then sequence, starting
|
|
from the selected starts till ends of the sequences (or a fixed position)
|
|
are taken to search next.
|
|
|
|
We call the possible top k results returned in one search the beam. This
|
|
search process can be repeated for pre-defined turns and leads to several
|
|
beam expansions.
|
|
|
|
Finally, the layer cross_entropy_over_beam takes all the beam expansions
|
|
which contain several candidate targets found along the multi-step search.
|
|
cross_entropy_over_beam calculates cross entropy over the expanded beams
|
|
    using all the candidates in the beam as the normalization factor.
|
|
|
|
Note that, if gold falls off the beam at search step t, then the cost is
|
|
calculated over the beam at step t.
|
|
|
|
This cost layer always works together with kmax_seq_score_layer,
|
|
sub_nested_seq_layer, and sequence_slice_layer to trim the input to form a
|
|
sub-search space.
|
|
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = cross_entropy_over_beam(input=[
|
|
BeamInput(
|
|
candidate_scores=beam1_candidates,
|
|
selected_candidates=beam1_topk,
|
|
gold=gold1),
|
|
BeamInput(
|
|
candidate_scores=beam2_candidates,
|
|
selected_candidates=beam2_topk,
|
|
gold=gold2),
|
|
])
|
|
|
|
|
|
:param input: Input beams for this layer.
|
|
:type input: BeamInput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
if isinstance(input, BeamInput):
|
|
input = [input]
|
|
else:
|
|
assert isinstance(input, list), (
|
|
            'input for cross_entropy_over_beam should be a python list '
|
|
'of BeamInput object.')
|
|
for ipt in input:
|
|
assert isinstance(ipt, BeamInput), (
|
|
'input for cross_entropy_over_beam '
|
|
'should be a BeamInput object.')
|
|
|
|
ipts = []
|
|
parents = []
|
|
for beam in input:
|
|
parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
|
|
ipts += [
|
|
beam.candidate_scores.name, beam.selected_candidates.name,
|
|
beam.gold.name
|
|
]
|
|
|
|
Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
|
|
return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
|
|
"""
|
|
    This is an L1 loss, but smoother. It requires that the
|
|
sizes of input and label are equal. The formula is as follows,
|
|
|
|
.. math::
|
|
|
|
L = \sum_{i} smooth_{L1}(input_i - label_i)
|
|
|
|
in which
|
|
|
|
.. math::
|
|
|
|
smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if} \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
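
    A rough NumPy sketch of this loss (purely illustrative; ``np`` is numpy and
    the sample vectors are assumptions):

    .. code-block:: python

        import numpy as np

        x = np.array([0.3, -1.7, 0.9]) - np.array([0.1, 0.2, 0.4])   # input - label
        elementwise = np.where(np.abs(x) < 1, 0.5 * x ** 2, np.abs(x) - 0.5)
        L = elementwise.sum()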
|
|
|
|
Reference:
|
|
`Fast R-CNN
|
|
<https://arxiv.org/pdf/1504.08083v2.pdf>`_
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
cost = smooth_l1_cost(input=input_layer,
|
|
label=label_layer)
|
|
|
|
:param input: The input layer.
|
|
:type input: LayerOutput
|
|
:param label: The input label.
|
|
    :type label: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param coeff: The weight of the gradient in the back propagation.
|
|
1.0 is the default value.
|
|
:type coeff: float
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
assert isinstance(label, LayerOutput)
|
|
assert input.size == label.size
|
|
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.SMOOTH_L1,
|
|
inputs=[input.name, label.name],
|
|
coeff=coeff,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.SMOOTH_L1, parents=[input, label], size=1)
|
|
|
|
|
|
@wrap_name_default()
|
|
def multiplex_layer(input, name=None, layer_attr=None):
|
|
"""
|
|
    This layer multiplexes multiple input layers according to the indices
    provided by the first input layer.
    inputs[0]: the indices of the layers used to form the output, of size batchSize.
    inputs[1:N]: the candidate output data.
    For each index i from 0 to batchSize - 1, the i-th row of the output is
    the same as the i-th row of the (index[i] + 1)-th layer.
|
|
|
|
For each i-th row of output:
|
|
.. math::
|
|
y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1)
|
|
|
|
where, y is output. :math:`x_{k}` is the k-th input layer and
|
|
:math:`k = x_{0}[i] + 1`.
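
    A rough NumPy sketch of the selection (purely illustrative; ``np`` is numpy
    and the shapes are example assumptions):

    .. code-block:: python

        import numpy as np

        batch_size, width = 4, 5
        idx = np.array([0, 1, 0, 1])                      # inputs[0]: indices
        candidates = [np.random.rand(batch_size, width)   # inputs[1:N]
                      for _ in range(2)]
        y = np.stack([candidates[idx[i]][i] for i in range(batch_size)])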
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
maxid = multiplex_layer(input=layers)
|
|
|
|
:param input: Input layers.
|
|
:type input: list of LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute.
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
assert isinstance(input, collections.Sequence)
|
|
assert len(input) > 2, 'multiplex_layer should have more than 2 inputs'
|
|
for i in range(1, len(input)):
|
|
assert isinstance(input[i], LayerOutput)
|
|
assert input[i].size == input[1].size, \
|
|
"All the input layers except the first one should have the same size"
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type='multiplex',
|
|
inputs=[x.name for x in input],
|
|
size=input[1].size,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.MULTIPLEX_LAYER,
|
|
parents=input,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default("dropout")
|
|
def dropout_layer(input, dropout_rate, name=None):
|
|
"""
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
dropout = dropout_layer(input=input_layer, dropout_rate=0.5)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param dropout_rate: The probability of dropout.
|
|
:type dropout_rate: float
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
return addto_layer(
|
|
name=name,
|
|
input=input,
|
|
act=LinearActivation(),
|
|
bias_attr=False,
|
|
layer_attr=ExtraAttr(drop_rate=dropout_rate))
|
|
|
|
|
|
@wrap_name_default()
|
|
@wrap_act_default(act=LinearActivation())
|
|
@wrap_param_attr_default()
|
|
@layer_support(DROPOUT)
|
|
def row_conv_layer(input,
|
|
context_len,
|
|
act=None,
|
|
name=None,
|
|
param_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
|
|
    The row convolution is also called lookahead convolution. It was first
    introduced in the paper `Deep Speech 2: End-to-End Speech Recognition
|
|
in English and Mandarin <https://arxiv.org/pdf/1512.02595v1.pdf>`_ .
|
|
|
|
    A bidirectional RNN learns a representation for a sequence by performing a
    forward and a backward pass through the entire sequence.
|
|
However, unlike unidirectional RNNs, bidirectional RNNs are challenging
|
|
to deploy in an online and low-latency setting. The lookahead convolution
|
|
incorporates information from future subsequences in a computationally
|
|
efficient manner to improve unidirectional RNNs.
|
|
|
|
The connection of row convolution is different from the 1D sequence
|
|
    convolution. Assume the future context length is k, that is to say, the
    output at timestep t is computed from the input features from the t-th
    timestep to the (t+k)-th timestep. Assume the hidden dimension of the input
    activations is d; the activations r_t for the new layer at timestep t are:
|
|
|
|
.. math::
|
|
|
|
        r_{t,i} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
|
|
\quad \\text{for} \quad (1 \leq i \leq d)
|
|
|
|
Note:
|
|
The `context_len` is `k + 1`. That is to say, the lookahead step
|
|
number plus one equals context_len.
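
    A rough NumPy sketch of the lookahead convolution for one sequence (purely
    illustrative; ``np`` is numpy, the shapes are assumptions, and the
    zero-padding at the tail is an assumption about boundary handling):

    .. code-block:: python

        import numpy as np

        T, d, context_len = 6, 4, 3            # sequence length, hidden dim, k + 1
        h = np.random.rand(T, d)               # input activations
        w = np.random.rand(context_len, d)     # learnable lookahead weights
        # pad the future timesteps that run past the end of the sequence
        h_pad = np.vstack([h, np.zeros((context_len - 1, d))])
        r = np.stack([(w * h_pad[t:t + context_len]).sum(axis=0) for t in range(T)])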
|
|
|
|
|
|
.. code-block:: python
|
|
|
|
row_conv = row_conv_layer(input=input_layer, context_len=3)
|
|
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param context_len: The context length equals the lookahead step number
|
|
plus one.
|
|
:type context_len: int
|
|
:param act: Activation Type. LinearActivation is the default activation.
|
|
:type act: BaseActivation
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for
|
|
details.
|
|
:type param_attr: ParameterAttribute
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
    assert context_len > 0, "the context_len must be greater than 0."
|
|
|
|
Layer(
|
|
inputs=[Input(input.name, **param_attr.attr)],
|
|
name=name,
|
|
context_length=context_len,
|
|
type=LayerType.ROW_CONV_LAYER,
|
|
active_type=act.name,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
|
|
|
|
|
|
@layer_support()
|
|
@wrap_name_default()
|
|
def prelu_layer(input,
|
|
name=None,
|
|
partial_sum=1,
|
|
channel_shared=None,
|
|
num_channels=None,
|
|
param_attr=None,
|
|
layer_attr=None):
|
|
"""
|
|
    The Parametric ReLU activation, which scales the negative part of the input
    with a learnable weight.
|
|
|
|
Reference:
|
|
`Delving Deep into Rectifiers: Surpassing Human-Level Performance on
|
|
ImageNet Classification <http://arxiv.org/pdf/1502.01852v1.pdf>`_
|
|
|
|
.. math::
|
|
        prelu(z_i) = \\begin{cases} z_i, & \\text{if} \\ z_i > 0 \\\\ a_i * z_i, & \\text{otherwise} \end{cases}
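
    A rough NumPy sketch of the activation (purely illustrative; ``np`` is
    numpy and the values are example assumptions):

    .. code-block:: python

        import numpy as np

        z = np.array([1.5, -2.0, 0.3, -0.1])
        a = np.full_like(z, 0.25)           # learnable weights (one per element here)
        out = np.where(z > 0, z, a * z)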
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
prelu = prelu_layer(input=layers, partial_sum=1)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param partial_sum: this parameter makes a group of inputs share the same weight.
|
|
|
|
- partial_sum = 1, indicates the element-wise activation: each element has a weight.
|
|
- partial_sum = number of elements in one channel, indicates the channel-wise activation, elements in a channel share the same weight.
|
|
- partial_sum = number of outputs, indicates all elements share the same weight.
|
|
|
|
:type partial_sum: int
|
|
    :param channel_shared: whether or not the parameter is shared across channels.
|
|
|
|
- channel_shared = True, we set the partial_sum to the number of outputs.
|
|
- channel_shared = False, we set the partial_sum to the number of elements in one channel.
|
|
|
|
:type channel_shared: bool
|
|
:param num_channels: number of input channel.
|
|
:type num_channels: int
|
|
:param param_attr: The parameter attribute. See ParameterAttribute for details.
|
|
:type param_attr: ParameterAttribute
|
|
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.'
|
|
|
|
if not param_attr:
|
|
param_attr = ParamAttr(initial_mean=0.25, initial_std=0.0)
|
|
else:
|
|
assert isinstance(param_attr, ParameterAttribute)
|
|
|
|
if num_channels is None:
|
|
assert input.num_filters is not None, \
|
|
'the input channel cannot be detected, please specify the num_channels parameter'
|
|
num_channels = input.num_filters
|
|
|
|
if channel_shared is not None:
|
|
assert isinstance(channel_shared, bool)
|
|
assert (input.height != 0 and input.width != 0), \
|
|
            'input height and width must be set'
|
|
if channel_shared:
|
|
partial_sum = input.height * input.width * num_channels
|
|
else:
|
|
partial_sum = input.height * input.width
|
|
|
|
l = Layer(
|
|
name=name,
|
|
type=LayerType.PRELU,
|
|
inputs=Input(input.name, **param_attr.attr),
|
|
partial_sum=partial_sum,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.PRELU,
|
|
parents=input,
|
|
num_filters=num_channels,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support(ERROR_CLIPPING, DROPOUT)
|
|
@wrap_act_default(act=LinearActivation())
|
|
def gated_unit_layer(input,
|
|
size,
|
|
act=None,
|
|
name=None,
|
|
gate_attr=None,
|
|
gate_param_attr=None,
|
|
gate_bias_attr=True,
|
|
inproj_attr=None,
|
|
inproj_param_attr=None,
|
|
inproj_bias_attr=True,
|
|
layer_attr=None):
|
|
"""
|
|
The gated unit layer implements a simple gating mechanism over the input.
|
|
The input :math:`X` is first projected into a new space :math:`X'`, and
|
|
it is also used to produce a gate weight :math:`\sigma`. Element-wise
|
|
product between :math:`X'` and :math:`\sigma` is finally returned.
|
|
|
|
Reference:
|
|
`Language Modeling with Gated Convolutional Networks
|
|
<https://arxiv.org/abs/1612.08083>`_
|
|
|
|
.. math::
|
|
y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
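
    A rough NumPy sketch of the gating (purely illustrative; ``np`` is numpy,
    the act is taken to be identity, and the shapes are example assumptions):

    .. code-block:: python

        import numpy as np

        batch, in_dim, size = 4, 6, 3
        X = np.random.rand(batch, in_dim)
        W, b = np.random.rand(in_dim, size), np.zeros(size)
        V, c = np.random.rand(in_dim, size), np.zeros(size)
        sigmoid = lambda t: 1.0 / (1.0 + np.exp(-t))
        y = (X.dot(W) + b) * sigmoid(X.dot(V) + c)   # element-wise gated projection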
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
        gated_unit = gated_unit_layer(size=128, input=input_layer)
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param size: The dimension of this layer's output.
|
|
:type size: int
|
|
:param act: Activation type of the projection. LinearActivation is the default
|
|
activation.
|
|
:type act: BaseActivation
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param gate_attr: The extra layer attribute of the gate. See ExtraLayerAttribute for
|
|
details.
|
|
:type gate_attr: ExtraLayerAttribute | None
|
|
:param gate_param_attr: The parameter attribute of the gate. See ParameterAttribute
|
|
for details.
|
|
:type gate_param_attr: ParameterAttribute
|
|
:param gate_bias_attr: The bias attribute of the gate. If this parameter is set to False or
|
|
an object whose type is not ParameterAttribute, no bias is defined.
|
|
If this parameter is set to True, the bias is initialized to zero.
|
|
:type gate_bias_attr: ParameterAttribute | bool | None | Any
|
|
:param inproj_attr: Extra layer attributes of the projection. See ExtraLayerAttribute for
|
|
details.
|
|
:type inproj_attr: ExtraLayerAttribute | None
|
|
:param inproj_param_attr: The parameter attribute of the projection. See ParameterAttribute
|
|
for details.
|
|
:type inproj_param_attr: ParameterAttribute
|
|
:param inproj_bias_attr: The bias attribute of the projection. If this parameter is set to False
|
|
or an object whose type is not ParameterAttribute, no bias is defined.
|
|
If this parameter is set to True, the bias is initialized to zero.
|
|
:type inproj_bias_attr: ParameterAttribute | bool | None | Any
|
|
:param layer_attr: Extra layer attribute of the product. See ExtraLayerAttribute for
|
|
details.
|
|
:type layer_attr: ExtraLayerAttribute | None
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
assert isinstance(
|
|
input, LayerOutput), 'The gated linear unit accepts only one input.'
|
|
|
|
input_proj = fc_layer(
|
|
input=input,
|
|
name="%s_input_proj" % name,
|
|
size=size,
|
|
act=act,
|
|
layer_attr=inproj_attr,
|
|
param_attr=inproj_param_attr,
|
|
bias_attr=inproj_bias_attr)
|
|
|
|
gate = fc_layer(
|
|
size=size,
|
|
name="%s_gate" % name,
|
|
act=SigmoidActivation(),
|
|
input=input,
|
|
layer_attr=gate_attr,
|
|
param_attr=gate_param_attr,
|
|
bias_attr=gate_bias_attr)
|
|
return mixed_layer(
|
|
name="%s_gated_act" % name,
|
|
input=dotmul_operator(input_proj, gate),
|
|
layer_attr=layer_attr)
|
|
|
|
|
|
@layer_support()
|
|
@wrap_name_default('switch_order')
|
|
def switch_order_layer(input,
|
|
name=None,
|
|
reshape_axis=None,
|
|
act=None,
|
|
layer_attr=None):
|
|
"""
|
|
    This layer switches the dimension order of the image input.
|
|
From order "batchSize, channels, height, width"
|
|
to order "batchSize, height, width, channels".
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
reshape_axis = 3
|
|
switch = switch_order(input=layer, name='switch', reshape_axis=reshape_axis)
|
|
reshape = {'height':[ 0, 1, 2], 'width':[3]}
|
|
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
    :param reshape_axis: The number of leading axes grouped into 'height' (the
                         remaining axes form 'width'). Its value should be
                         positive and less than 4.
|
|
:type reshape_axis: int
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
assert isinstance(input, LayerOutput)
|
|
    assert reshape_axis is not None and 0 < reshape_axis < 4
    height = [ele for ele in range(reshape_axis)]
    width = [ele for ele in range(reshape_axis, 4)]
|
|
reshape = {'height': height, 'width': width}
|
|
|
|
l = Layer(
|
|
name=name,
|
|
inputs=input.name,
|
|
reshape=reshape,
|
|
type=LayerType.SWITCH_ORDER_LAYER,
|
|
active_type=act.name,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.SWITCH_ORDER_LAYER,
|
|
activation=act,
|
|
parents=input,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
|
|
"""
|
|
This layer crops images according to the offset and shape. Users can set
|
|
the crop shape through the argument 'shape' explicitly or by specifying a
|
|
reference input layer.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
crop = crop_layer(input=[image_input, reference_input], axis=2, offset=[2, 3])
|
|
|
|
:param input: The input of this layer. If two inputs are given, the second one
|
|
will be regarded as the reference.
|
|
And the input must be 4-dims and in NCHW order.
|
|
:type input: LayerOutput | Sequence
|
|
:param offset: The crop offset.
|
|
:type offset: Sequence
|
|
:param axis: The start axis to be cropped. For image input layer:
|
|
- 0: batch size
|
|
- 1: channels
|
|
- 2: height
|
|
- 3: width
|
|
:type axis: int
|
|
:param shape: The shape to be cropped to. Default is None.
|
|
:type shape: Sequence | None
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
if isinstance(input, LayerOutput):
|
|
input = [input]
|
|
else:
|
|
assert isinstance(input, collections.Sequence)
|
|
l = Layer(
|
|
inputs=[x.name for x in input],
|
|
axis=axis,
|
|
offset=offset,
|
|
shape=shape,
|
|
name=name,
|
|
type=LayerType.CROP_LAYER,
|
|
**ExtraLayerAttribute.to_kwargs(layer_attr))
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.CROP_LAYER,
|
|
parents=input,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
@layer_support()
|
|
def sub_nested_seq_layer(input, selected_indices, name=None):
|
|
"""
|
|
The sub_nested_seq_layer accepts two inputs: the first one is a nested
|
|
    sequence; the second one is a set of selected indices in the nested sequence.
|
|
|
|
Then sub_nest_seq_layer trims the first nested sequence input according
|
|
to the selected indices to form a new output. This layer is useful in
|
|
beam training.
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids)
|
|
|
|
|
|
:param input: The input of this layer. It is a nested sequence.
|
|
:type input: LayerOutput
|
|
:param selected_indices: A set of sequence indices in the nested sequence.
|
|
    :type selected_indices: LayerOutput
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
|
|
assert isinstance(input, LayerOutput), (
|
|
'The first input of '
|
|
'sub_nested_seq_layer must be a Paddle layer.')
|
|
assert isinstance(selected_indices, LayerOutput), (
|
|
'The second input of '
|
|
'sub_nested_seq_layer must be a Paddle layer.')
|
|
|
|
l = Layer(
|
|
inputs=input.name,
|
|
selected_indices=selected_indices.name,
|
|
name=name,
|
|
type=LayerType.SUB_NESTED_SEQ)
|
|
return LayerOutput(
|
|
name=name,
|
|
layer_type=LayerType.SUB_NESTED_SEQ,
|
|
parents=input,
|
|
size=l.config.size)
|
|
|
|
|
|
@wrap_name_default("clip")
|
|
def clip_layer(input, min, max, name=None):
|
|
"""
|
|
A layer for clipping the input value by the threshold.
|
|
|
|
.. math::
|
|
|
|
out[i] = \min (\max (in[i],p_{1} ),p_{2} )
|
|
|
|
.. code-block:: python
|
|
|
|
clip = clip_layer(input=input_layer, min=-10, max=10)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput.
|
|
:param min: The lower threshold for clipping.
|
|
:type min: float
|
|
:param max: The upper threshold for clipping.
|
|
:type max: float
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.CLIP_LAYER,
|
|
inputs=[input.name],
|
|
min=min,
|
|
max=max)
|
|
return LayerOutput(
|
|
name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
|
|
|
|
|
|
@wrap_name_default()
|
|
def seq_slice_layer(input, starts, ends, name=None):
|
|
"""
|
|
seq_slice_layer will return one or several sub-sequences from the
|
|
input sequence layer given start and end indices.
|
|
|
|
- If only start indices are given, and end indices are set to None,
|
|
this layer slices the input sequence from the given start indices
|
|
to its end.
|
|
- If only end indices are given, and start indices are set to None,
|
|
this layer slices the input sequence from its beginning to the
|
|
given end indices.
|
|
- If start and end indices are both given, they should have the same
|
|
number of elements.
|
|
|
|
If start or end indices contains more than one elements, the input sequence
|
|
will be sliced for multiple times.
|
|
|
|
|
|
.. code-block:: python
|
|
|
|
        seq_slice = seq_slice_layer(input=input_seq,
|
|
starts=start_pos, ends=end_pos)
|
|
|
|

    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param input: The input of this layer, which should be a sequence.
    :type input: LayerOutput
    :param starts: The start indices to slice the input sequence.
    :type starts: LayerOutput | None
    :param ends: The end indices to slice the input sequence.
    :type ends: LayerOutput | None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """

    assert isinstance(input, LayerOutput), (
        'The first input of seq_slice layer must be a PaddlePaddle layer.')

    if starts is not None:
        assert isinstance(starts, LayerOutput), (
            'The start indices for seq_slice layer '
            'must be a PaddlePaddle layer.')
    if ends is not None:
        assert isinstance(ends, LayerOutput), (
            'The end indices for seq_slice layer must be a PaddlePaddle layer.')
    assert starts is not None or ends is not None, (
        'start and end indices '
        'cannot be set to None at the same time, at least one of '
        'them should be given.')
    if starts is not None and ends is not None:
        assert starts.size == ends.size, (
            'If start and end indices are both given to seq_slice_layer, '
            'they should have the same width.')

    Layer(
        name=name,
        type=LayerType.SEQ_SLICE,
        inputs=input.name,
        starts=starts.name if starts is not None else None,
        ends=ends.name if ends is not None else None)
    return LayerOutput(
        name, LayerType.SEQ_SLICE, parents=[input], size=input.size)


@wrap_name_default()
@layer_support()
def kmax_seq_score_layer(input, name=None, beam_size=1):
    """
    This layer accepts one input that stores scores over a sequence or a
    nested sequence, and returns the indices of the beam_size sequences
    with the highest scores.

    .. code-block:: python

        kmax_indices = kmax_seq_score_layer(input=input_layer, beam_size=5)
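
    A minimal sketch of producing the width-1 score input this layer expects
    (``seq_feature`` is a hypothetical sequence layer; projecting every time
    step to a single score with ``fc_layer`` is just one way to obtain it):

    .. code-block:: python

        scores = fc_layer(input=seq_feature, size=1,
                          act=SigmoidActivation())
        kmax_indices = kmax_seq_score_layer(input=scores, beam_size=5)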

    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param input: The input of this layer. It stores scores over a sequence or
                  a nested sequence and its size must be 1.
    :type input: LayerOutput
    :param beam_size: The number of sequences with the highest scores whose
                      indices are returned.
    :type beam_size: int
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    assert isinstance(input, LayerOutput), ("kmax_seq_score_layer "
                                            "accepts only one input.")
    assert input.size == 1, (
        "input of kmax_seq_score_layer is a score "
        "over a sequence or a nested sequence, so its width must be 1.")

    Layer(
        name=name,
        type=LayerType.KMAX_SEQ_SCORE,
        inputs=[input.name],
        beam_size=beam_size)

    return LayerOutput(
        name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
@wrap_name_default("conv3d")
|
|
@wrap_param_attr_default()
|
|
@wrap_bias_attr_default()
|
|
@wrap_act_default(act=ReluActivation())
|
|
@layer_support(DROPOUT)
|
|
def img_conv3d_layer(input,
|
|
filter_size,
|
|
num_filters,
|
|
name=None,
|
|
num_channels=None,
|
|
act=None,
|
|
groups=1,
|
|
stride=1,
|
|
padding=0,
|
|
bias_attr=None,
|
|
param_attr=None,
|
|
shared_biases=True,
|
|
layer_attr=None,
|
|
trans=False,
|
|
layer_type=None):
|
|
"""
|
|
|
|
The example usage is:
|
|
|
|
.. code-block:: python
|
|
|
|
conv = img_conv3d_layer(input=data, filter_size=1,
|
|
num_channels=8,
|
|
num_filters=16, stride=1,
|
|
bias_attr=False,
|
|
act=ReluActivation())
|
|
|
|
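
    When the kernel, stride, or padding differs per axis, a 3-element list or
    tuple can be passed instead of a single integer (the values below are
    purely illustrative):

    .. code-block:: python

        conv = img_conv3d_layer(input=data, num_channels=8, num_filters=16,
                                filter_size=[3, 3, 1],
                                stride=[2, 2, 1],
                                padding=[1, 1, 0],
                                act=ReluActivation())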

    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param input: The input of this layer.
    :type input: LayerOutput
    :param filter_size: The dimensions of the filter kernel along three axes. If the
                        parameter is set to one integer, the three dimensions will
                        be the same.
    :type filter_size: int | tuple | list
    :param num_filters: The number of filters. It is equal to the number of output
                        channels.
    :type num_filters: int
    :param act: Activation type. ReluActivation is the default activation.
    :type act: BaseActivation
    :param groups: The number of the filter groups.
    :type groups: int
    :param stride: The strides of the convolution along three axes. If the parameter
                   is set to one integer, the three strides will be the same.
    :type stride: int | tuple | list
    :param padding: The numbers of padding along three axes. If the parameter is set
                    to one integer, they will be the same.
    :type padding: int | tuple | list
    :param bias_attr: The bias attribute. If the parameter is set to False or an object
                      whose type is not ParameterAttribute, no bias is defined. If the
                      parameter is set to True, the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
    :param num_channels: The number of input channels. If the parameter is not set or
                         set to None, its actual value will be automatically set to
                         the channel number of the input.
    :type num_channels: int
    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute
                       for details.
    :type param_attr: ParameterAttribute
    :param shared_biases: Whether biases will be shared between filters or not.
    :type shared_biases: bool
    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
                       details.
    :type layer_attr: ExtraLayerAttribute
    :param trans: True if it is a transposed convolution layer, False if it is a
                  convolution layer.
    :type trans: bool
    :param layer_type: Specify the layer type. If the parameter is set, it must be
                       "deconv3d" when trans=True. If not set, it will be automatically
                       set to "deconv3d" when trans=True and "conv3d" when trans=False.
    :type layer_type: basestring
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    if num_channels is None:
        assert input.num_filters is not None
        num_channels = input.num_filters

    if isinstance(filter_size, collections.Sequence):
        assert len(filter_size) == 3
        filter_size, filter_size_y, filter_size_z = filter_size
    else:
        filter_size_y = filter_size
        filter_size_z = filter_size

    if isinstance(stride, collections.Sequence):
        assert len(stride) == 3
        stride, stride_y, stride_z = stride
    else:
        stride_y = stride
        stride_z = stride

    if isinstance(padding, collections.Sequence):
        assert len(padding) == 3
        padding, padding_y, padding_z = padding
    else:
        padding_y = padding
        padding_z = padding

    if param_attr.attr.get('initial_smart'):
        # special initial for conv layers.
        init_w = (2.0 / (filter_size**2 * num_channels))**0.5
        param_attr.attr["initial_mean"] = 0.0
        param_attr.attr["initial_std"] = init_w
        param_attr.attr["initial_strategy"] = 0
        param_attr.attr["initial_smart"] = False

    if layer_type:
        if trans:
            assert layer_type in ["deconv3d"]
        lt = layer_type
    else:
        lt = LayerType.DECONV3D_LAYER if trans else LayerType.CONV3D_LAYER

    l = Layer(
        name=name,
        inputs=Input(
            input.name,
            conv=Conv3D(
                filter_size=filter_size,
                padding=padding,
                stride=stride,
                channels=num_channels,
                groups=groups,
                filter_size_y=filter_size_y,
                padding_y=padding_y,
                stride_y=stride_y,
                filter_size_z=filter_size_z,
                padding_z=padding_z,
                stride_z=stride_z),
            **param_attr.attr),
        active_type=act.name,
        num_filters=num_filters,
        bias=ParamAttr.to_bias(bias_attr),
        shared_biases=shared_biases,
        type=lt,
        **ExtraLayerAttribute.to_kwargs(layer_attr))
    return LayerOutput(
        name,
        lt,
        parents=[input],
        activation=act,
        num_filters=num_filters,
        size=l.config.size)
@wrap_name_default("scale_shift")
|
|
@wrap_param_attr_default()
|
|
@wrap_bias_attr_default()
|
|
def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
|
|
"""
|
|
A layer applies a linear transformation to each element in each row of
|
|
the input matrix. For each element, the layer first re-scales it and then
|
|
adds a bias to it.
|
|
|
|
This layer is very like the SlopeInterceptLayer, except the scale and
|
|
bias are trainable.
|
|
|
|
.. math::
|
|
|
|
y = w * x + b
|
|
|
|
.. code-block:: python
|
|
|
|
scale_shift = scale_shift_layer(input=input_layer, bias_attr=False)
|
|
|
|
:param name: The name of this layer. It is optional.
|
|
:type name: basestring
|
|
:param input: The input of this layer.
|
|
:type input: LayerOutput
|
|
:param param_attr: The parameter attribute of scaling. See ParameterAttribute for
|
|
details.
|
|
:type param_attr: ParameterAttribute
|
|
:param bias_attr: The bias attribute. If the parameter is set to False or an object
|
|
whose type is not ParameterAttribute, no bias is defined. If the
|
|
parameter is set to True, the bias is initialized to zero.
|
|
:type bias_attr: ParameterAttribute | None | bool | Any
|
|
:return: LayerOutput object.
|
|
:rtype: LayerOutput
|
|
"""
|
|
Layer(
|
|
name=name,
|
|
type=LayerType.SCALE_SHIFT_LAYER,
|
|
inputs=Input(input.name, **param_attr.attr),
|
|
bias=ParamAttr.to_bias(bias_attr))
|
|
return LayerOutput(
|
|
name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
|
|
|
|
|
|
@wrap_name_default("resize")
|
|
def resize_layer(input, size, name=None):
|
|
"""
|
|
The resize layer resizes the input matrix with a shape of [Height, Width]
|
|
into the output matrix with a shape of [Height x Width / size, size],
|
|
where size is the parameter of this layer indicating the output dimension.
|
|
|
|
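
    A minimal sketch (the input width of 8 below is purely illustrative): an
    input of shape [Height, 8] resized with ``size=4`` produces an output of
    shape [Height x 2, 4].

    .. code-block:: python

        resized = resize_layer(input=input_layer, size=4)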

    :param input: The input of this layer.
    :type input: LayerOutput
    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param size: The resized output dimension of this layer.
    :type size: int
    :return: A LayerOutput object.
    :rtype: LayerOutput
    """
    Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
    return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)


@wrap_act_default(act=LinearActivation())
@wrap_name_default('sub_seq')
def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
    """
    sub_seq_layer returns sub-sequences from the input sequences. For each
    sequence in the input sequence layer, sub_seq_layer slices it by the given
    offset and size. Please note that the number of offset values and the
    number of size values must both equal the number of sequences in the
    input layer.

    .. code-block:: python

        sub_seq = sub_seq_layer(input=input_seq, offsets=offsets, sizes=sizes)

    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param input: The input of this layer, which should be a sequence.
    :type input: LayerOutput
    :param offsets: The offset indices to slice the input sequence, which should
                    be sequence type.
    :type offsets: LayerOutput
    :param sizes: The sizes of the sub-sequences, which should be sequence type.
    :type sizes: LayerOutput
    :param act: Activation type, LinearActivation is the default activation.
    :type act: BaseActivation
    :param bias_attr: The bias attribute. If the parameter is set to False or an object
                      whose type is not ParameterAttribute, no bias is defined. If the
                      parameter is set to True, the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
    :return: LayerOutput object.
    :rtype: LayerOutput
    """

    assert isinstance(input, LayerOutput), (
        'The first input of sub_seq_layer must be a PaddlePaddle layer.')
    assert isinstance(offsets, LayerOutput), (
        'The offset indices for sub_seq_layer '
        'must be a PaddlePaddle layer.')
    assert isinstance(sizes, LayerOutput), (
        'The sizes of sub-sequences must be a PaddlePaddle layer.')

    Layer(
        name=name,
        type=LayerType.SUB_SEQ_LAYER,
        inputs=[input.name, offsets.name, sizes.name],
        active_type=act.name,
        bias=ParamAttr.to_bias(bias_attr))

    return LayerOutput(
        name,
        LayerType.SUB_SEQ_LAYER,
        parents=[input, offsets, sizes],
        size=input.size)


@wrap_name_default('scale_sub_region')
def scale_sub_region_layer(input, indices, value, name=None):
    """
    Given an image or feature map with CHW information, scale_sub_region_layer
    can be used to multiply the values inside a continuous sub-region by a real
    value. You can provide the start and end indices along C, H and W for each
    instance. Please note that all start indices count from 1.
    The shape of indices should be [batch_size, 6] and the layout for each row
    is [C_Start, C_End, H_Start, H_End, W_Start, W_End].

    .. code-block:: python

        scale_sub_region = scale_sub_region_layer(input=input,
                                                  indices=indices,
                                                  value=value)
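
    A minimal sketch of wiring up the ``indices`` input (the layer names and
    the value below are illustrative); ``indices`` is simply a width-6 data
    layer holding [C_Start, C_End, H_Start, H_End, W_Start, W_End] per
    instance:

    .. code-block:: python

        region = data_layer(name='region_indices', size=6)
        scaled = scale_sub_region_layer(input=feature_map,
                                        indices=region,
                                        value=0.0)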

    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param input: The input of this layer, which should contain CHW information.
    :type input: LayerOutput
    :param indices: Start and end indices for C, H and W. The input value should
                    be a 2-D matrix with shape [batch_size, 6].
    :type indices: LayerOutput
    :param value: The value to multiply.
    :type value: float
    :return: LayerOutput object.
    :rtype: LayerOutput
    """

    assert isinstance(input, LayerOutput), (
        'The first input of scale_sub_region_layer '
        'must be a PaddlePaddle layer.')
    assert isinstance(indices, LayerOutput), (
        'The start and end indices for CHW must be a PaddlePaddle layer.')
    assert isinstance(value, float), (
        'The value to multiply must be a real value.')

    Layer(
        name=name,
        type=LayerType.SCALE_SUB_REGION_LAYER,
        inputs=[input.name, indices.name],
        value=value)

    return LayerOutput(
        name,
        LayerType.SCALE_SUB_REGION_LAYER,
        parents=[input, indices],
        num_filters=input.num_filters,
        size=input.size)


@wrap_name_default()
@wrap_act_default(act=LinearActivation())
@wrap_param_attr_default()
@layer_support()
def factorization_machine(input,
                          factor_size,
                          act=None,
                          name=None,
                          param_attr=None,
                          layer_attr=None):
    """
    The Factorization Machine models pairwise feature interactions as the
    inner product of the learned latent vectors corresponding to each input
    feature. The Factorization Machine can effectively capture feature
    interactions, especially when the input is sparse.

    This implementation only considers the second-order feature interactions
    of the Factorization Machine, using the formula:

    .. math::
        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \\rangle x_i x_j

    Note:
        X is the input vector with size n. V is the factor matrix. Each row of
        V is the latent vector corresponding to each input dimension. The size
        of each latent vector is k.
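
    As an illustrative, NumPy-only sketch of the second-order term above
    (assuming ``x`` has shape [n] and the factor matrix ``v`` has shape
    [n, k]; this sketch is not part of the PaddlePaddle API):

    .. code-block:: python

        import numpy as np

        def fm_second_order(x, v):
            # sum_{i<j} <v_i, v_j> * x_i * x_j, computed in O(n * k)
            xv = x[:, np.newaxis] * v  # shape [n, k]
            return 0.5 * np.sum(np.sum(xv, axis=0) ** 2
                                - np.sum(xv ** 2, axis=0))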

    For details of the Factorization Machine, please refer to the paper:
    Factorization machines.

    .. code-block:: python

        first_order = paddle.layer.fc(input=input,
                                      size=1,
                                      act=paddle.activation.Linear())
        second_order = paddle.layer.factorization_machine(input=input,
                                                          factor_size=10)
        fm = paddle.layer.addto(input=[first_order, second_order],
                                act=paddle.activation.Linear(),
                                bias_attr=False)

    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param input: The input layer. Supported input types: all input data types
                  on CPU, and only dense input types on GPU.
    :type input: LayerOutput
    :param factor_size: The hyperparameter that defines the dimensionality (size)
                        of the latent vectors.
    :type factor_size: int
    :param act: Activation Type. Default is linear activation.
    :type act: BaseActivation
    :param param_attr: The parameter attribute. See ParameterAttribute for
                       details.
    :type param_attr: ParameterAttribute
    :param layer_attr: Extra layer configuration. See ExtraLayerAttribute for
                       details.
    :type layer_attr: ExtraLayerAttribute | None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    assert isinstance(input, LayerOutput)
    assert factor_size > 0, "the factor_size must be greater than 0."

    Layer(
        inputs=[Input(input.name, **param_attr.attr)],
        name=name,
        factor_size=factor_size,
        type=LayerType.FACTORIZATION_MACHINE,
        active_type=act.name,
        **ExtraLayerAttribute.to_kwargs(layer_attr))
    return LayerOutput(
        name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)