# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

from activations import LinearActivation, ReluActivation, SoftmaxActivation, \
    IdentityActivation, TanhActivation, SequenceSoftmaxActivation
from attrs import ExtraAttr
from default_decorators import wrap_name_default, wrap_act_default, \
    wrap_param_default, wrap_bias_attr_default, wrap_param_attr_default
from layers import *  # There are too many layers used in networks, so import *
from poolings import MaxPooling, SumPooling
from paddle.trainer.config_parser import *

__all__ = [
    'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
    "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
    'img_conv_group', 'img_separable_conv', 'vgg_16_network', 'gru_unit',
    'gru_group', 'simple_gru', 'simple_attention', 'dot_product_attention',
    'multi_head_attention', 'simple_gru2', 'bidirectional_gru',
    'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'
]

######################################################
#                     Text CNN                       #
######################################################


@wrap_name_default("sequence_conv_pooling")
def sequence_conv_pool(input,
                       context_len,
                       hidden_size,
                       name=None,
                       context_start=None,
                       pool_type=None,
                       context_proj_layer_name=None,
                       context_proj_param_attr=False,
                       fc_layer_name=None,
                       fc_param_attr=None,
                       fc_bias_attr=None,
                       fc_act=None,
                       pool_bias_attr=None,
                       fc_attr=None,
                       context_attr=None,
                       pool_attr=None):
    """
    Text convolution pooling group.

    Text input => Context Projection => FC Layer => Pooling => Output.

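    A minimal usage sketch (assuming ``emb`` is an existing sequence
    ``LayerOutput``, e.g. from an embedding layer; names and sizes here
    are illustrative):

    ..  code-block:: python

        conv_pool = sequence_conv_pool(input=emb,
                                       context_len=3,
                                       hidden_size=128)
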
    :param name: group name.
    :type name: basestring
    :param input: input layer.
    :type input: LayerOutput
    :param context_len: context projection length. See
                        context_projection's document.
    :type context_len: int
    :param hidden_size: FC Layer size.
    :type hidden_size: int
    :param context_start: context start position. See
                          context_projection's context_start.
    :type context_start: int|None
    :param pool_type: pooling layer type. See pooling_layer's document.
    :type pool_type: BasePoolingType
    :param context_proj_layer_name: context projection layer name.
                                    None if the user does not care.
    :type context_proj_layer_name: basestring
    :param context_proj_param_attr: padding parameter attribute of the context
                                    projection layer. If False, the padding is
                                    always zero.
    :type context_proj_param_attr: ParameterAttribute|None
    :param fc_layer_name: fc layer name. None if the user does not care.
    :type fc_layer_name: basestring
    :param fc_param_attr: fc layer parameter attribute. None if the user does
                          not care.
    :type fc_param_attr: ParameterAttribute|None
    :param fc_bias_attr: fc bias parameter attribute. False if no bias,
                         None if the user does not care.
    :type fc_bias_attr: ParameterAttribute|False|None
    :param fc_act: fc layer activation type. None means tanh.
    :type fc_act: BaseActivation
    :param pool_bias_attr: pooling layer bias attr. False if no bias,
                           None if the user does not care.
    :type pool_bias_attr: ParameterAttribute|False|None
    :param fc_attr: fc layer extra attribute.
    :type fc_attr: ExtraLayerAttribute
    :param context_attr: context projection layer extra attribute.
    :type context_attr: ExtraLayerAttribute
    :param pool_attr: pooling layer extra attribute.
    :type pool_attr: ExtraLayerAttribute
    :return: layer's output.
    :rtype: LayerOutput
    """
    # Set default values for parameters.
    context_proj_layer_name = "%s_conv_proj" % name \
        if context_proj_layer_name is None else context_proj_layer_name

    with mixed_layer(
            name=context_proj_layer_name,
            size=input.size * context_len,
            act=LinearActivation(),
            layer_attr=context_attr) as m:
        m += context_projection(
            input,
            context_len=context_len,
            context_start=context_start,
            padding_attr=context_proj_param_attr)

    fc_layer_name = "%s_conv_fc" % name \
        if fc_layer_name is None else fc_layer_name
    fl = fc_layer(
        name=fc_layer_name,
        input=m,
        size=hidden_size,
        act=fc_act,
        layer_attr=fc_attr,
        param_attr=fc_param_attr,
        bias_attr=fc_bias_attr)

    return pooling_layer(
        name=name,
        input=fl,
        pooling_type=pool_type,
        bias_attr=pool_bias_attr,
        layer_attr=pool_attr)


text_conv_pool = sequence_conv_pool

############################################################################
#                       Images                                             #
############################################################################


@wrap_name_default("conv_pool")
def simple_img_conv_pool(input,
                         filter_size,
                         num_filters,
                         pool_size,
                         name=None,
                         pool_type=None,
                         act=None,
                         groups=1,
                         conv_stride=1,
                         conv_padding=0,
                         bias_attr=None,
                         num_channel=None,
                         param_attr=None,
                         shared_bias=True,
                         conv_layer_attr=None,
                         pool_stride=1,
                         pool_padding=0,
                         pool_layer_attr=None):
    """
    Simple image convolution and pooling group.

    Img input => Conv => Pooling => Output.

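    A minimal usage sketch (assuming ``img`` is an existing image
    ``LayerOutput``; names and sizes here are illustrative):

    ..  code-block:: python

        conv_pool = simple_img_conv_pool(input=img,
                                         filter_size=3,
                                         num_filters=32,
                                         pool_size=2,
                                         pool_stride=2,
                                         act=ReluActivation())
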
    :param name: group name.
    :type name: basestring
    :param input: input layer.
    :type input: LayerOutput
    :param filter_size: see img_conv_layer for details.
    :type filter_size: int
    :param num_filters: see img_conv_layer for details.
    :type num_filters: int
    :param pool_size: see img_pool_layer for details.
    :type pool_size: int
    :param pool_type: see img_pool_layer for details.
    :type pool_type: BasePoolingType
    :param act: see img_conv_layer for details.
    :type act: BaseActivation
    :param groups: see img_conv_layer for details.
    :type groups: int
    :param conv_stride: see img_conv_layer for details.
    :type conv_stride: int
    :param conv_padding: see img_conv_layer for details.
    :type conv_padding: int
    :param bias_attr: see img_conv_layer for details.
    :type bias_attr: ParameterAttribute
    :param num_channel: see img_conv_layer for details.
    :type num_channel: int
    :param param_attr: see img_conv_layer for details.
    :type param_attr: ParameterAttribute
    :param shared_bias: see img_conv_layer for details.
    :type shared_bias: bool
    :param conv_layer_attr: see img_conv_layer for details.
    :type conv_layer_attr: ExtraLayerAttribute
    :param pool_stride: see img_pool_layer for details.
    :type pool_stride: int
    :param pool_padding: see img_pool_layer for details.
    :type pool_padding: int
    :param pool_layer_attr: see img_pool_layer for details.
    :type pool_layer_attr: ExtraLayerAttribute
    :return: layer's output
    :rtype: LayerOutput
    """
    _conv_ = img_conv_layer(
        name="%s_conv" % name,
        input=input,
        filter_size=filter_size,
        num_filters=num_filters,
        num_channels=num_channel,
        act=act,
        groups=groups,
        stride=conv_stride,
        padding=conv_padding,
        bias_attr=bias_attr,
        param_attr=param_attr,
        shared_biases=shared_bias,
        layer_attr=conv_layer_attr)
    return img_pool_layer(
        name="%s_pool" % name,
        input=_conv_,
        pool_size=pool_size,
        pool_type=pool_type,
        stride=pool_stride,
        padding=pool_padding,
        layer_attr=pool_layer_attr)


@wrap_name_default("conv_bn_pool")
def img_conv_bn_pool(input,
                     filter_size,
                     num_filters,
                     pool_size,
                     name=None,
                     pool_type=None,
                     act=None,
                     groups=1,
                     conv_stride=1,
                     conv_padding=0,
                     conv_bias_attr=None,
                     num_channel=None,
                     conv_param_attr=None,
                     shared_bias=True,
                     conv_layer_attr=None,
                     bn_param_attr=None,
                     bn_bias_attr=None,
                     bn_layer_attr=None,
                     pool_stride=1,
                     pool_padding=0,
                     pool_layer_attr=None):
    """
    Convolution, batch normalization, pooling group.

    Img input => Conv => BN => Pooling => Output.

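    A minimal usage sketch (assuming ``img`` is an existing image
    ``LayerOutput``; names and sizes here are illustrative):

    ..  code-block:: python

        conv_bn = img_conv_bn_pool(input=img,
                                   filter_size=3,
                                   num_filters=32,
                                   pool_size=2,
                                   pool_stride=2,
                                   act=ReluActivation())
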
    :param name: group name.
    :type name: basestring
    :param input: input layer.
    :type input: LayerOutput
    :param filter_size: see img_conv_layer for details.
    :type filter_size: int
    :param num_filters: see img_conv_layer for details.
    :type num_filters: int
    :param pool_size: see img_pool_layer for details.
    :type pool_size: int
    :param pool_type: see img_pool_layer for details.
    :type pool_type: BasePoolingType
    :param act: see batch_norm_layer for details.
    :type act: BaseActivation
    :param groups: see img_conv_layer for details.
    :type groups: int
    :param conv_stride: see img_conv_layer for details.
    :type conv_stride: int
    :param conv_padding: see img_conv_layer for details.
    :type conv_padding: int
    :param conv_bias_attr: see img_conv_layer for details.
    :type conv_bias_attr: ParameterAttribute
    :param num_channel: see img_conv_layer for details.
    :type num_channel: int
    :param conv_param_attr: see img_conv_layer for details.
    :type conv_param_attr: ParameterAttribute
    :param shared_bias: see img_conv_layer for details.
    :type shared_bias: bool
    :param conv_layer_attr: see img_conv_layer for details.
    :type conv_layer_attr: ExtraLayerAttribute
    :param bn_param_attr: see batch_norm_layer for details.
    :type bn_param_attr: ParameterAttribute
    :param bn_bias_attr: see batch_norm_layer for details.
    :type bn_bias_attr: ParameterAttribute
    :param bn_layer_attr: see batch_norm_layer for details.
    :type bn_layer_attr: ExtraLayerAttribute
    :param pool_stride: see img_pool_layer for details.
    :type pool_stride: int
    :param pool_padding: see img_pool_layer for details.
    :type pool_padding: int
    :param pool_layer_attr: see img_pool_layer for details.
    :type pool_layer_attr: ExtraLayerAttribute
    :return: layer's output
    :rtype: LayerOutput
    """
    __conv__ = img_conv_layer(
        name="%s_conv" % name,
        input=input,
        filter_size=filter_size,
        num_filters=num_filters,
        num_channels=num_channel,
        act=LinearActivation(),
        groups=groups,
        stride=conv_stride,
        padding=conv_padding,
        bias_attr=conv_bias_attr,
        param_attr=conv_param_attr,
        shared_biases=shared_bias,
        layer_attr=conv_layer_attr)
    __bn__ = batch_norm_layer(
        name="%s_bn" % name,
        input=__conv__,
        act=act,
        bias_attr=bn_bias_attr,
        param_attr=bn_param_attr,
        layer_attr=bn_layer_attr)
    return img_pool_layer(
        name="%s_pool" % name,
        input=__bn__,
        pool_type=pool_type,
        pool_size=pool_size,
        stride=pool_stride,
        padding=pool_padding,
        layer_attr=pool_layer_attr)


@wrap_act_default(param_names=['conv_act'], act=ReluActivation())
@wrap_param_default(
    param_names=['pool_type'], default_factory=lambda _: MaxPooling())
def img_conv_group(input,
                   conv_num_filter,
                   pool_size,
                   num_channels=None,
                   conv_padding=1,
                   conv_filter_size=3,
                   conv_act=None,
                   conv_with_batchnorm=False,
                   conv_batchnorm_drop_rate=0,
                   pool_stride=1,
                   pool_type=None,
                   param_attr=None):
    """
    Image convolution group, used for vgg net.

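    A minimal usage sketch (assuming ``img`` is an existing image
    ``LayerOutput`` with 3 channels; names and sizes here are illustrative):

    ..  code-block:: python

        conv_group = img_conv_group(input=img,
                                    num_channels=3,
                                    conv_num_filter=[64, 64],
                                    pool_size=2,
                                    pool_stride=2)
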
    :param input: input layer.
    :type input: LayerOutput
    :param conv_num_filter: list of output channels num.
    :type conv_num_filter: list|tuple
    :param pool_size: pooling filter size.
    :type pool_size: int
    :param num_channels: input channels num.
    :type num_channels: int
    :param conv_padding: convolution padding size.
    :type conv_padding: int
    :param conv_filter_size: convolution filter size.
    :type conv_filter_size: int
    :param conv_act: activation function after convolution.
    :type conv_act: BaseActivation
    :param conv_with_batchnorm: if conv_with_batchnorm[i] is true,
        there is a batch normalization operation after each convolution.
    :type conv_with_batchnorm: list
    :param conv_batchnorm_drop_rate: if conv_with_batchnorm[i] is true,
        conv_batchnorm_drop_rate[i] represents the drop rate of each batch norm.
    :type conv_batchnorm_drop_rate: list
    :param pool_stride: pooling stride size.
    :type pool_stride: int
    :param pool_type: pooling type.
    :type pool_type: BasePoolingType
    :param param_attr: param attribute of convolution layer,
                       None means default attribute.
    :type param_attr: ParameterAttribute
    :return: layer's output
    :rtype: LayerOutput
    """
    tmp = input

    # Type checks
    assert isinstance(tmp, LayerOutput)
    assert isinstance(conv_num_filter, list) or isinstance(conv_num_filter,
                                                           tuple)
    for each_num_filter in conv_num_filter:
        assert isinstance(each_num_filter, int)

    assert isinstance(pool_size, int)

    def __extend_list__(obj):
        if not hasattr(obj, '__len__'):
            return [obj] * len(conv_num_filter)
        else:
            return obj

    conv_padding = __extend_list__(conv_padding)
    conv_filter_size = __extend_list__(conv_filter_size)
    conv_act = __extend_list__(conv_act)
    conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
    conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)

    for i in xrange(len(conv_num_filter)):
        extra_kwargs = dict()
        if num_channels is not None:
            extra_kwargs['num_channels'] = num_channels
            num_channels = None
        if conv_with_batchnorm[i]:
            extra_kwargs['act'] = LinearActivation()
        else:
            extra_kwargs['act'] = conv_act[i]

        tmp = img_conv_layer(
            input=tmp,
            padding=conv_padding[i],
            filter_size=conv_filter_size[i],
            num_filters=conv_num_filter[i],
            param_attr=param_attr,
            **extra_kwargs)

        # logger.debug("tmp.num_filters = %d" % tmp.num_filters)

        if conv_with_batchnorm[i]:
            dropout = conv_batchnorm_drop_rate[i]
            if dropout == 0 or abs(dropout) < 1e-5:  # dropout not set
                tmp = batch_norm_layer(input=tmp, act=conv_act[i])
            else:
                tmp = batch_norm_layer(
                    input=tmp,
                    act=conv_act[i],
                    layer_attr=ExtraAttr(drop_rate=dropout))

    return img_pool_layer(
        input=tmp, stride=pool_stride, pool_size=pool_size, pool_type=pool_type)


@wrap_name_default("separable_conv")
def img_separable_conv(input,
                       num_channels,
                       num_out_channels,
                       filter_size,
                       stride=1,
                       padding=0,
                       depth_multiplier=1,
                       act=None,
                       bias_attr=None,
                       param_attr=None,
                       shared_bias=True,
                       layer_type='exconv',
                       name=None):
    """
    Separable Convolution.

    The separable convolution module consists of a depthwise convolution
    that acts separately on input channels, followed by a pointwise
    convolution with 1*1 kernels that mixes channels. It is used for
    Xception: https://arxiv.org/pdf/1610.02357.pdf

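    A minimal usage sketch (assuming ``img`` is an existing image
    ``LayerOutput`` with 3 channels; names and sizes here are illustrative):

    ..  code-block:: python

        sep_conv = img_separable_conv(input=img,
                                      num_channels=3,
                                      num_out_channels=32,
                                      filter_size=3,
                                      act=ReluActivation())
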
    :param input: input layer.
    :type input: LayerOutput
    :param num_channels: the number of input channels.
    :type num_channels: int
    :param num_out_channels: the number of output channels.
    :type num_out_channels: int
    :param filter_size: the filter size for the depthwise convolution.
    :type filter_size: int|tuple
    :param stride: the stride size for the depthwise convolution.
    :type stride: int|tuple
    :param padding: the padding size for the depthwise convolution.
    :type padding: int|tuple
    :param depth_multiplier: the number of filters for one channel in the
                             depthwise convolution.
    :type depth_multiplier: int
    :param act: the activation function for the output.
    :type act: BaseActivation
    :param bias_attr: see img_conv_layer for details.
    :type bias_attr: ParameterAttribute
    :param param_attr: see img_conv_layer for details.
    :type param_attr: ParameterAttribute
    :param shared_bias: see img_conv_layer for details.
    :type shared_bias: bool
    :param layer_type: see img_conv_layer for details.
    :type layer_type: basestring
    :return: layer's output
    :rtype: LayerOutput
    """
    __depthwise_conv__ = img_conv_layer(
        name="%s_depthwise_conv" % name,
        input=input,
        num_channels=num_channels,
        num_filters=num_channels * depth_multiplier,
        groups=num_channels,
        filter_size=filter_size,
        stride=stride,
        padding=padding,
        act=LinearActivation(),
        bias_attr=bias_attr,
        param_attr=param_attr,
        shared_biases=shared_bias,
        layer_type=layer_type)
    __pointwise_conv__ = img_conv_layer(
        name="%s_pointwise_conv" % name,
        input=__depthwise_conv__,
        num_channels=num_channels * depth_multiplier,
        num_filters=num_out_channels,
        filter_size=1,
        stride=1,
        padding=0,
        act=act,
        bias_attr=bias_attr,
        param_attr=param_attr,
        shared_biases=shared_bias)
    return __pointwise_conv__


def small_vgg(input_image, num_channels, num_classes):
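    """
    A small VGG-style network for small images: four convolution groups
    (64, 128, 256 and 512 filters) with batch normalization and dropout,
    each followed by 2x2 max pooling, then a dropout layer, a 512-unit fc
    layer with batch normalization, and a final softmax fc layer.

    :param input_image: input layer.
    :type input_image: LayerOutput
    :param num_channels: input channels num.
    :type num_channels: int
    :param num_classes: number of classes.
    :type num_classes: int
    :return: layer's output
    :rtype: LayerOutput
    """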
    def __vgg__(ipt, num_filter, times, dropouts, num_channels_=None):
        return img_conv_group(
            input=ipt,
            num_channels=num_channels_,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * times,
            conv_filter_size=3,
            conv_act=ReluActivation(),
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type=MaxPooling())

    tmp = __vgg__(input_image, 64, 2, [0.3, 0], num_channels)
    tmp = __vgg__(tmp, 128, 2, [0.4, 0])
    tmp = __vgg__(tmp, 256, 3, [0.4, 0.4, 0])
    tmp = __vgg__(tmp, 512, 3, [0.4, 0.4, 0])
    tmp = img_pool_layer(
        input=tmp, stride=2, pool_size=2, pool_type=MaxPooling())
    tmp = dropout_layer(input=tmp, dropout_rate=0.5)
    tmp = fc_layer(
        input=tmp,
        size=512,
        layer_attr=ExtraAttr(drop_rate=0.5),
        act=LinearActivation())
    tmp = batch_norm_layer(input=tmp, act=ReluActivation())
    return fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())


def vgg_16_network(input_image, num_channels, num_classes=1000):
    """
    The same model as https://gist.github.com/ksimonyan/211839e770f7b538e2d8

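    A minimal usage sketch (assuming ``img`` is an existing 224x224 image
    ``LayerOutput`` with 3 channels; names here are illustrative):

    ..  code-block:: python

        prediction = vgg_16_network(input_image=img,
                                    num_channels=3,
                                    num_classes=1000)
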
    :param input_image: input layer.
    :type input_image: LayerOutput
    :param num_channels: input channels num.
    :type num_channels: int
    :param num_classes: number of classes.
    :type num_classes: int
    :return: layer's output
    :rtype: LayerOutput
    """

    tmp = img_conv_group(
        input=input_image,
        num_channels=num_channels,
        conv_padding=1,
        conv_num_filter=[64, 64],
        conv_filter_size=3,
        conv_act=ReluActivation(),
        pool_size=2,
        pool_stride=2,
        pool_type=MaxPooling())

    tmp = img_conv_group(
        input=tmp,
        conv_num_filter=[128, 128],
        conv_padding=1,
        conv_filter_size=3,
        conv_act=ReluActivation(),
        pool_stride=2,
        pool_type=MaxPooling(),
        pool_size=2)

    tmp = img_conv_group(
        input=tmp,
        conv_num_filter=[256, 256, 256],
        conv_padding=1,
        conv_filter_size=3,
        conv_act=ReluActivation(),
        pool_stride=2,
        pool_type=MaxPooling(),
        pool_size=2)

    tmp = img_conv_group(
        input=tmp,
        conv_num_filter=[512, 512, 512],
        conv_padding=1,
        conv_filter_size=3,
        conv_act=ReluActivation(),
        pool_stride=2,
        pool_type=MaxPooling(),
        pool_size=2)

    tmp = img_conv_group(
        input=tmp,
        conv_num_filter=[512, 512, 512],
        conv_padding=1,
        conv_filter_size=3,
        conv_act=ReluActivation(),
        pool_stride=2,
        pool_type=MaxPooling(),
        pool_size=2)

    tmp = fc_layer(
        input=tmp,
        size=4096,
        act=ReluActivation(),
        layer_attr=ExtraAttr(drop_rate=0.5))

    tmp = fc_layer(
        input=tmp,
        size=4096,
        act=ReluActivation(),
        layer_attr=ExtraAttr(drop_rate=0.5))

    return fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())


############################################################################
#                       Recurrent                                          #
############################################################################


@wrap_name_default("lstm")
def simple_lstm(input,
                size,
                name=None,
                reverse=False,
                mat_param_attr=None,
                bias_param_attr=None,
                inner_param_attr=None,
                act=None,
                gate_act=None,
                state_act=None,
                mixed_layer_attr=None,
                lstm_cell_attr=None):
    """
    Simple LSTM Cell.

    It just combines a mixed layer with full_matrix_projection and a lstmemory
    layer. The simple lstm cell is implemented with the following equations.

    ..  math::

        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)

        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)

        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)

        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)

        h_t & = o_t tanh(c_t)

    Please refer to **Generating Sequences With Recurrent Neural Networks** for
    more details about lstm. Link_ is here.

    .. _Link: http://arxiv.org/abs/1308.0850

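    A minimal usage sketch (assuming ``emb`` is an existing sequence
    ``LayerOutput``; names and sizes here are illustrative):

    ..  code-block:: python

        lstm = simple_lstm(input=emb, size=256)
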
    :param name: lstm layer name.
    :type name: basestring
    :param input: layer's input.
    :type input: LayerOutput
    :param size: lstm layer size.
    :type size: int
    :param reverse: process the input in a reverse order or not.
    :type reverse: bool
    :param mat_param_attr: parameter attribute of matrix projection in mixed
                           layer.
    :type mat_param_attr: ParameterAttribute
    :param bias_param_attr: bias parameter attribute. False means no bias, None
                            means default bias.
    :type bias_param_attr: ParameterAttribute|False
    :param inner_param_attr: parameter attribute of lstm cell.
    :type inner_param_attr: ParameterAttribute
    :param act: last activation type of lstm.
    :type act: BaseActivation
    :param gate_act: gate activation type of lstm.
    :type gate_act: BaseActivation
    :param state_act: state activation type of lstm.
    :type state_act: BaseActivation
    :param mixed_layer_attr: extra attribute of mixed layer.
    :type mixed_layer_attr: ExtraLayerAttribute
    :param lstm_cell_attr: extra attribute of lstm.
    :type lstm_cell_attr: ExtraLayerAttribute
    :return: layer's output.
    :rtype: LayerOutput
    """
    fc_name = 'lstm_transform_%s' % name
    with mixed_layer(
            name=fc_name,
            size=size * 4,
            act=IdentityActivation(),
            layer_attr=mixed_layer_attr,
            bias_attr=False) as m:
        m += full_matrix_projection(input, param_attr=mat_param_attr)

    return lstmemory(
        name=name,
        input=m,
        reverse=reverse,
        bias_attr=bias_param_attr,
        param_attr=inner_param_attr,
        act=act,
        gate_act=gate_act,
        state_act=state_act,
        layer_attr=lstm_cell_attr)


@wrap_name_default('lstm_unit')
def lstmemory_unit(input,
                   out_memory=None,
                   name=None,
                   size=None,
                   param_attr=None,
                   act=None,
                   gate_act=None,
                   state_act=None,
                   input_proj_bias_attr=None,
                   input_proj_layer_attr=None,
                   lstm_bias_attr=None,
                   lstm_layer_attr=None):
    """
    lstmemory_unit defines the calculation process of an LSTM unit during a
    single time step. This function is not a recurrent layer, so it cannot be
    directly used to process sequence input. This function is always used in
    recurrent_group (see layers.py for more details) to implement attention
    mechanism.

    Please refer to **Generating Sequences With Recurrent Neural Networks**
    for more details about LSTM. The link goes as follows:
    .. _Link: https://arxiv.org/abs/1308.0850

    ..  math::

        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)

        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)

        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)

        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)

        h_t & = o_t tanh(c_t)

    The example usage is:

    ..  code-block:: python

        lstm_step = lstmemory_unit(input=[layer1],
                                   size=256,
                                   act=TanhActivation(),
                                   gate_act=SigmoidActivation(),
                                   state_act=TanhActivation())

    :param input: Input layer.
    :type input: LayerOutput
    :param out_memory: The output of previous time step.
    :type out_memory: LayerOutput | None
    :param name: The lstmemory unit name.
    :type name: basestring
    :param size: The lstmemory unit size.
    :type size: int
    :param param_attr: The parameter attribute for the weights in
                     input to hidden projection.
                     None means default attribute.
    :type param_attr: ParameterAttribute
    :param act: The last activation type of lstm.
    :type act: BaseActivation
    :param gate_act: The gate activation type of lstm.
    :type gate_act: BaseActivation
    :param state_act: The state activation type of lstm.
    :type state_act: BaseActivation
    :param input_proj_bias_attr: The parameter attribute for the bias in
                      input to hidden projection.
                      False or None means no bias.
                      If this parameter is set to True,
                      the bias is initialized to zero.
    :type input_proj_bias_attr: ParameterAttribute|bool|None
    :param input_proj_layer_attr: The extra layer attribute for
                     input to hidden projection of the LSTM unit,
                     such as dropout, error clipping.
    :type input_proj_layer_attr: ExtraLayerAttribute
    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
                      False or None means no bias.
                      If this parameter is set to True,
                      the bias is initialized to zero.
    :type lstm_bias_attr: ParameterAttribute|True|None
    :param lstm_layer_attr: The extra attribute of lstm layer.
    :type lstm_layer_attr: ExtraLayerAttribute
    :return: The lstmemory unit output.
    :rtype: LayerOutput
    """
    if size is None:
        assert input.size % 4 == 0
        size = input.size / 4
    if out_memory is None:
        out_mem = memory(name=name, size=size)
    else:
        out_mem = out_memory

    state_mem = memory(name="%s_state" % name, size=size)

    with mixed_layer(
            name="%s_input_recurrent" % name,
            size=size * 4,
            bias_attr=input_proj_bias_attr,
            layer_attr=input_proj_layer_attr,
            act=IdentityActivation()) as m:
        m += identity_projection(input=input)
        m += full_matrix_projection(input=out_mem, param_attr=param_attr)

    lstm_out = lstm_step_layer(
        name=name,
        input=m,
        state=state_mem,
        size=size,
        bias_attr=lstm_bias_attr,
        act=act,
        gate_act=gate_act,
        state_act=state_act,
        layer_attr=lstm_layer_attr)
    get_output_layer(name='%s_state' % name, input=lstm_out, arg_name='state')

    return lstm_out


@wrap_name_default('lstm_group')
def lstmemory_group(input,
                    size=None,
                    name=None,
                    out_memory=None,
                    reverse=False,
                    param_attr=None,
                    act=None,
                    gate_act=None,
                    state_act=None,
                    input_proj_bias_attr=None,
                    input_proj_layer_attr=None,
                    lstm_bias_attr=None,
                    lstm_layer_attr=None):
    """
    lstm_group is a recurrent_group version of Long Short Term Memory. It
    does exactly the same calculation as the lstmemory layer (see lstmemory
    in layers.py for the maths). A promising benefit is that LSTM memory
    cell states (or hidden states) in every time step are accessible to the
    user. This is especially useful in attention models. If you do not need to
    access the internal states of the lstm and merely use its outputs,
    it is recommended to use the lstmemory, which is relatively faster than
    lstmemory_group.

    NOTE: In PaddlePaddle's implementation, the following input-to-hidden
    multiplications:
    :math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
    :math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
    speed up the calculations. Consequently, an additional mixed_layer with
    full_matrix_projection must be included before lstmemory_unit is called.

    The example usage is:

    ..  code-block:: python

        lstm_step = lstmemory_group(input=[layer1],
                                    size=256,
                                    act=TanhActivation(),
                                    gate_act=SigmoidActivation(),
                                    state_act=TanhActivation())

    :param input: Input layer.
    :type input: LayerOutput
    :param size: The lstmemory group size.
    :type size: int
    :param name: The name of lstmemory group.
    :type name: basestring
    :param out_memory: The output of previous time step.
    :type out_memory: LayerOutput | None
    :param reverse: Process the input in a reverse order or not.
    :type reverse: bool
    :param param_attr: The parameter attribute for the weights in
                     input to hidden projection.
                     None means default attribute.
    :type param_attr: ParameterAttribute
    :param act: The last activation type of lstm.
    :type act: BaseActivation
    :param gate_act: The gate activation type of lstm.
    :type gate_act: BaseActivation
    :param state_act: The state activation type of lstm.
    :type state_act: BaseActivation
    :param input_proj_bias_attr: The parameter attribute for the bias in
                      input to hidden projection.
                      False or None means no bias.
                      If this parameter is set to True,
                      the bias is initialized to zero.
    :type input_proj_bias_attr: ParameterAttribute|bool|None
    :param input_proj_layer_attr: The extra layer attribute for
                     input to hidden projection of the LSTM unit,
                     such as dropout, error clipping.
    :type input_proj_layer_attr: ExtraLayerAttribute
    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
                      False or None means no bias.
                      If this parameter is set to True,
                      the bias is initialized to zero.
    :type lstm_bias_attr: ParameterAttribute|True|None
    :param lstm_layer_attr: The extra attribute of lstm layer.
    :type lstm_layer_attr: ExtraLayerAttribute
    :return: the lstmemory group.
    :rtype: LayerOutput
    """

    def __lstm_step__(ipt):
        return lstmemory_unit(
            input=ipt,
            name=name,
            size=size,
            act=act,
            gate_act=gate_act,
            state_act=state_act,
            out_memory=out_memory,
            input_proj_bias_attr=input_proj_bias_attr,
            input_proj_layer_attr=input_proj_layer_attr,
            param_attr=param_attr,
            lstm_layer_attr=lstm_layer_attr,
            lstm_bias_attr=lstm_bias_attr)

    return recurrent_group(
        name='%s_recurrent_group' % name,
        step=__lstm_step__,
        reverse=reverse,
        input=input)


@wrap_name_default('gru_unit')
def gru_unit(input,
             memory_boot=None,
             size=None,
             name=None,
             gru_bias_attr=None,
             gru_param_attr=None,
             act=None,
             gate_act=None,
             gru_layer_attr=None,
             naive=False):
    """
    gru_unit defines the calculation process of a gated recurrent unit during a
    single time step. This function is not a recurrent layer, so it cannot be
    directly used to process sequence input. This function is always used in
    the recurrent_group (see layers.py for more details) to implement attention
    mechanism.

    Please see grumemory in layers.py for the details about the maths.

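    A minimal usage sketch (assuming ``x`` is the step input inside a
    recurrent_group, already projected to 3 * size; names here are
    illustrative):

    ..  code-block:: python

        gru_step = gru_unit(input=x,
                            size=256,
                            act=TanhActivation(),
                            gate_act=SigmoidActivation())
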
    :param input: input layer.
    :type input: LayerOutput
    :param memory_boot: the initialization state of the GRU cell.
    :type memory_boot: LayerOutput | None
    :param name: name of the gru group.
    :type name: basestring
    :param size: hidden size of the gru.
    :type size: int
    :param act: activation type of gru
    :type act: BaseActivation
    :param gate_act: gate activation type of gru
    :type gate_act: BaseActivation
    :param gru_layer_attr: Extra attribute of the gru layer.
    :type gru_layer_attr: ExtraLayerAttribute
    :return: the gru output layer.
    :rtype: LayerOutput
    """

    assert input.size % 3 == 0
    if size is None:
        size = input.size / 3

    out_mem = memory(name=name, size=size, boot_layer=memory_boot)

    if naive:
        __step__ = gru_step_naive_layer
    else:
        __step__ = gru_step_layer

    gru_out = __step__(
        name=name,
        input=input,
        output_mem=out_mem,
        size=size,
        bias_attr=gru_bias_attr,
        param_attr=gru_param_attr,
        act=act,
        gate_act=gate_act,
        layer_attr=gru_layer_attr)
    return gru_out


@wrap_name_default('gru_group')
def gru_group(input,
              memory_boot=None,
              size=None,
              name=None,
              reverse=False,
              gru_bias_attr=None,
              gru_param_attr=None,
              act=None,
              gate_act=None,
              gru_layer_attr=None,
              naive=False):
    """
    gru_group is a recurrent_group version of the Gated Recurrent Unit. It
    does exactly the same calculation as the grumemory layer does. A promising
    benefit is that gru hidden states are accessible to the user. This is
    especially useful in attention models. If you do not need to access
    any internal state and merely use the outputs of a GRU, it is recommended
    to use the grumemory, which is relatively faster.

    Please see grumemory in layers.py for more detail about the maths.

    The example usage is:

    ..  code-block:: python

        gru = gru_group(input=[layer1],
                        size=256,
                        act=TanhActivation(),
                        gate_act=SigmoidActivation())

    :param input: input layer.
    :type input: LayerOutput
    :param memory_boot: the initialization state of the GRU cell.
    :type memory_boot: LayerOutput | None
    :param name: name of the gru group.
    :type name: basestring
    :param size: hidden size of the gru.
    :type size: int
    :param reverse: process the input in a reverse order or not.
    :type reverse: bool
    :param act: activation type of gru
    :type act: BaseActivation
    :param gate_act: gate activation type of gru
    :type gate_act: BaseActivation
    :param gru_bias_attr: bias parameter attribute of gru layer,
                          False means no bias, None means default bias.
    :type gru_bias_attr: ParameterAttribute|False|None
    :param gru_layer_attr: Extra attribute of the gru layer.
    :type gru_layer_attr: ExtraLayerAttribute
    :return: the gru group.
    :rtype: LayerOutput
    """

    def __gru_step__(ipt):
        return gru_unit(
            input=ipt,
            memory_boot=memory_boot,
            name=name,
            size=size,
            gru_bias_attr=gru_bias_attr,
            gru_param_attr=gru_param_attr,
            act=act,
            gate_act=gate_act,
            gru_layer_attr=gru_layer_attr,
            naive=naive)

    return recurrent_group(
        name='%s_recurrent_group' % name,
        step=__gru_step__,
        reverse=reverse,
        input=input)


@wrap_name_default('simple_gru')
def simple_gru(input,
               size,
               name=None,
               reverse=False,
               mixed_param_attr=None,
               mixed_bias_param_attr=None,
               mixed_layer_attr=None,
               gru_bias_attr=None,
               gru_param_attr=None,
               act=None,
               gate_act=None,
               gru_layer_attr=None,
               naive=False):
    """
    You may see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
    simple_gru in network.py. The reason why there are so many interfaces is
    that we have two ways to implement recurrent neural networks. One way is to
    use one complete layer to implement the rnn (including simple rnn, gru and
    lstm) with multiple time steps, such as recurrent_layer, lstmemory,
    grumemory. But the multiplication operation :math:`W x_t` is not computed
    in these layers. See details in their interfaces in layers.py.
    The other implementation is to use a recurrent group, which can assemble a
    series of layers to compute an rnn step by step. This way is flexible for
    attention mechanism or other complex connections.

    - gru_step_layer: only computes the rnn by one step. It needs a memory as
      input and can be used in a recurrent group.
    - gru_unit: a wrapper of gru_step_layer with memory.
    - gru_group: a GRU cell implemented by a combination of multiple layers in
      a recurrent group.
      But :math:`W x_t` is not done in the group.
    - gru_memory: a GRU cell implemented by one layer, which does the same
      calculation as gru_group and is faster than gru_group.
    - simple_gru: a complete GRU implementation including :math:`W x_t` and
      gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`, see
      formula in grumemory.

    In terms of computational speed, grumemory is relatively faster than
    gru_group, and gru_group is relatively faster than simple_gru.

    The example usage is:

    ..  code-block:: python

        gru = simple_gru(input=[layer1], size=256)

    :param input: input layer.
    :type input: LayerOutput
    :param name: name of the gru group.
    :type name: basestring
    :param size: hidden size of the gru.
    :type size: int
    :param reverse: process the input in a reverse order or not.
    :type reverse: bool
    :param act: activation type of gru
    :type act: BaseActivation
    :param gate_act: gate activation type of gru
    :type gate_act: BaseActivation
    :param gru_bias_attr: bias parameter attribute of gru layer,
                          False means no bias, None means default bias.
    :type gru_bias_attr: ParameterAttribute|False|None
    :param gru_layer_attr: Extra attribute of the gru layer.
    :type gru_layer_attr: ExtraLayerAttribute
    :return: the gru group.
    :rtype: LayerOutput
    """
    with mixed_layer(
            name='%s_transform' % name,
            size=size * 3,
            bias_attr=mixed_bias_param_attr,
            layer_attr=mixed_layer_attr) as m:
        m += full_matrix_projection(input=input, param_attr=mixed_param_attr)

    return gru_group(
        name=name,
        size=size,
        input=m,
        reverse=reverse,
        gru_bias_attr=gru_bias_attr,
        gru_param_attr=gru_param_attr,
        act=act,
        gate_act=gate_act,
        gru_layer_attr=gru_layer_attr,
        naive=naive)


@wrap_name_default('simple_gru2')
def simple_gru2(input,
                size,
                name=None,
                reverse=False,
                mixed_param_attr=None,
                mixed_bias_attr=None,
                gru_param_attr=None,
                gru_bias_attr=None,
                act=None,
                gate_act=None,
                mixed_layer_attr=None,
                gru_cell_attr=None):
    """
    simple_gru2 is the same as simple_gru, but uses grumemory instead.
    Please refer to grumemory in layers.py for more detail about the maths.
    simple_gru2 is faster than simple_gru.

    The example usage is:

    ..  code-block:: python

        gru = simple_gru2(input=[layer1], size=256)

    :param input: input layer.
    :type input: LayerOutput
    :param name: name of the gru group.
    :type name: basestring
    :param size: hidden size of the gru.
    :type size: int
    :param reverse: process the input in a reverse order or not.
    :type reverse: bool
    :param act: activation type of gru
    :type act: BaseActivation
    :param gate_act: gate activation type of gru
    :type gate_act: BaseActivation
    :param gru_bias_attr: bias parameter attribute of gru layer,
                          False means no bias, None means default bias.
    :type gru_bias_attr: ParameterAttribute|False|None
    :param gru_param_attr: parameter attribute of gru layer,
                           None means default parameter.
    :type gru_param_attr: ParameterAttribute|None
    :return: the gru group.
    :rtype: LayerOutput
    """
    with mixed_layer(
            name='%s_transform' % name,
            size=size * 3,
            bias_attr=mixed_bias_attr,
            layer_attr=mixed_layer_attr) as m:
        m += full_matrix_projection(input=input, param_attr=mixed_param_attr)

    return grumemory(
        name=name,
        input=m,
        reverse=reverse,
        bias_attr=gru_bias_attr,
        param_attr=gru_param_attr,
        act=act,
        gate_act=gate_act,
        layer_attr=gru_cell_attr)


@wrap_name_default("bidirectional_gru")
def bidirectional_gru(input,
                      size,
                      name=None,
                      return_seq=False,
                      fwd_mixed_param_attr=None,
                      fwd_mixed_bias_attr=None,
                      fwd_gru_param_attr=None,
                      fwd_gru_bias_attr=None,
                      fwd_act=None,
                      fwd_gate_act=None,
                      fwd_mixed_layer_attr=None,
                      fwd_gru_cell_attr=None,
                      bwd_mixed_param_attr=None,
                      bwd_mixed_bias_attr=None,
                      bwd_gru_param_attr=None,
                      bwd_gru_bias_attr=None,
                      bwd_act=None,
                      bwd_gate_act=None,
                      bwd_mixed_layer_attr=None,
                      bwd_gru_cell_attr=None,
                      last_seq_attr=None,
                      first_seq_attr=None,
                      concat_attr=None,
                      concat_act=None):
    """
    A bidirectional_gru is a recurrent unit that iterates over the input
    sequence both in forward and backward orders, and then concatenates the
    two outputs to form a final output. However, concatenation of the two
    outputs is not the only way to form the final output; you can also, for
    example, just add them together.

    The example usage is:

    ..  code-block:: python

        bi_gru = bidirectional_gru(input=[input1], size=512)

    :param name: bidirectional gru layer name.
    :type name: basestring
    :param input: input layer.
    :type input: LayerOutput
    :param size: gru layer size.
    :type size: int
    :param return_seq: If set False, the outputs at the last time step are
                       concatenated and returned.
                       If set True, the entire output sequences in forward
                       and backward directions are concatenated and returned.
    :type return_seq: bool
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    args = locals()

    fw = simple_gru2(
        name='%s_fw' % name,
        input=input,
        size=size,
        **dict((k[len('fwd_'):], v) for k, v in args.iteritems()
               if k.startswith('fwd_')))

    bw = simple_gru2(
        name="%s_bw" % name,
        input=input,
        size=size,
        reverse=True,
        **dict((k[len('bwd_'):], v) for k, v in args.iteritems()
               if k.startswith('bwd_')))

    if return_seq:
        return concat_layer(
            name=name, input=[fw, bw], layer_attr=concat_attr, act=concat_act)
    else:
        fw_seq = last_seq(
            name="%s_fw_last" % name, input=fw, layer_attr=last_seq_attr)
        bw_seq = first_seq(
            name="%s_bw_last" % name, input=bw, layer_attr=first_seq_attr)
        return concat_layer(
            name=name,
            input=[fw_seq, bw_seq],
            layer_attr=concat_attr,
            act=concat_act)


@wrap_name_default("bidirectional_lstm")
def bidirectional_lstm(input,
                       size,
                       name=None,
                       return_seq=False,
                       fwd_mat_param_attr=None,
                       fwd_bias_param_attr=None,
                       fwd_inner_param_attr=None,
                       fwd_act=None,
                       fwd_gate_act=None,
                       fwd_state_act=None,
                       fwd_mixed_layer_attr=None,
                       fwd_lstm_cell_attr=None,
                       bwd_mat_param_attr=None,
                       bwd_bias_param_attr=None,
                       bwd_inner_param_attr=None,
                       bwd_act=None,
                       bwd_gate_act=None,
                       bwd_state_act=None,
                       bwd_mixed_layer_attr=None,
                       bwd_lstm_cell_attr=None,
                       last_seq_attr=None,
                       first_seq_attr=None,
                       concat_attr=None,
                       concat_act=None):
    """
    A bidirectional_lstm is a recurrent unit that iterates over the input
    sequence in both forward and backward order and then concatenates the
    two outputs to form a final output. Concatenation is not the only way
    to combine the two outputs, however; you could also, for example,
    simply add them together.

    Please refer to **Neural Machine Translation by Jointly Learning to Align
    and Translate** (https://arxiv.org/pdf/1409.0473v3.pdf) for more details
    about the bidirectional lstm.

    The example usage is:

    ..  code-block:: python

        bi_lstm = bidirectional_lstm(input=[input1], size=512)

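    To obtain the concatenated output at every time step (e.g., as features
    for sequence labeling) rather than only at the sequence ends, set
    return_seq:

    ..  code-block:: python

        bi_lstm_seq = bidirectional_lstm(input=[input1], size=512,
                                         return_seq=True)
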
    :param name: bidirectional lstm layer name.
    :type name: basestring
    :param input: input layer.
    :type input: LayerOutput
    :param size: lstm layer size.
    :type size: int
    :param return_seq: If set to False, the outputs at the last time step of
                       the forward and backward sequences are concatenated
                       and returned.
                       If set to True, the entire output sequences in the
                       forward and backward directions are concatenated and
                       returned.
    :type return_seq: bool
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    args = locals()

    fw = simple_lstm(
        name='%s_fw' % name,
        input=input,
        size=size,
        **dict((k[len('fwd_'):], v) for k, v in args.iteritems()
               if k.startswith('fwd_')))

    bw = simple_lstm(
        name="%s_bw" % name,
        input=input,
        size=size,
        reverse=True,
        **dict((k[len('bwd_'):], v) for k, v in args.iteritems()
               if k.startswith('bwd_')))

    if return_seq:
        return concat_layer(
            name=name, input=[fw, bw], layer_attr=concat_attr, act=concat_act)
    else:
        fw_seq = last_seq(
            name="%s_fw_last" % name, input=fw, layer_attr=last_seq_attr)
        bw_seq = first_seq(
            name="%s_bw_last" % name, input=bw, layer_attr=first_seq_attr)
        return concat_layer(
            name=name,
            input=[fw_seq, bw_seq],
            layer_attr=concat_attr,
            act=concat_act)


@wrap_name_default()
@wrap_act_default(param_names=['weight_act'], act=TanhActivation())
def simple_attention(encoded_sequence,
                     encoded_proj,
                     decoder_state,
                     transform_param_attr=None,
                     softmax_param_attr=None,
                     weight_act=None,
                     name=None):
    """
    Calculate and return a context vector with an attention mechanism.
    The size of the context vector equals the size of the encoded_sequence.

    ..  math::

        a(s_{i-1},h_{j}) & = v_{a}f(W_{a}s_{i-1} + U_{a}h_{j})

        e_{i,j} & = a(s_{i-1}, h_{j})

        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}

        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}

    where :math:`h_{j}` is the jth element of encoded_sequence,
    :math:`U_{a}h_{j}` is the jth element of encoded_proj,
    :math:`s_{i-1}` is decoder_state, and
    :math:`f` is weight_act, which is set to tanh by default.

    Please refer to **Neural Machine Translation by Jointly Learning to
    Align and Translate** for more details. The link is as follows:
    https://arxiv.org/abs/1409.0473.

    The example usage is:

    ..  code-block:: python

        context = simple_attention(encoded_sequence=enc_seq,
                                   encoded_proj=enc_proj,
                                   decoder_state=decoder_prev)

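    Inside a decoder step, decoder_prev would typically be a memory of the
    decoder state; a minimal sketch (decoder_size is assumed to be defined
    elsewhere):

    ..  code-block:: python

        decoder_prev = memory(name='decoder_state', size=decoder_size)
        context = simple_attention(encoded_sequence=enc_seq,
                                   encoded_proj=enc_proj,
                                   decoder_state=decoder_prev)
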
    :param name: name of the attention model.
    :type name: basestring
    :param softmax_param_attr: parameter attribute of the sequence softmax
                               that is used to produce the attention weight.
    :type softmax_param_attr: ParameterAttribute
    :param weight_act: activation of the attention model.
    :type weight_act: BaseActivation
    :param encoded_sequence: output of the encoder.
    :type encoded_sequence: LayerOutput
    :param encoded_proj: the attention weight is computed by a feed-forward
                         neural network which has two inputs: the decoder's
                         hidden state of the previous time step and the
                         encoder's output. encoded_proj is the output of the
                         feed-forward network for the encoder's output. Here
                         we pre-compute it outside simple_attention for speed.
    :type encoded_proj: LayerOutput
    :param decoder_state: hidden state of the decoder in the previous time
                          step.
    :type decoder_state: LayerOutput
    :param transform_param_attr: parameter attribute of the feed-forward
                                 network that takes decoder_state as input to
                                 compute the attention weight.
    :type transform_param_attr: ParameterAttribute
    :return: a context vector.
    :rtype: LayerOutput
    """
    assert encoded_proj.size == decoder_state.size
    proj_size = encoded_proj.size

    with mixed_layer(size=proj_size, name="%s_transform" % name) as m:
        m += full_matrix_projection(
            decoder_state, param_attr=transform_param_attr)

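    # Replicate the transformed decoder state across every time step of the
    # encoder sequence so that it can be combined with encoded_proj below.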
    expanded = expand_layer(
        input=m, expand_as=encoded_sequence, name='%s_expand' % name)

    with mixed_layer(
            size=proj_size, act=weight_act, name="%s_combine" % name) as m:
        m += identity_projection(expanded)
        m += identity_projection(encoded_proj)

    # sequence softmax is used to normalize similarities between decoder state
    # and encoder outputs into a distribution
    attention_weight = fc_layer(
        input=m,
        size=1,
        act=SequenceSoftmaxActivation(),
        param_attr=softmax_param_attr,
        name="%s_softmax" % name,
        bias_attr=False)

    scaled = scaling_layer(
        weight=attention_weight,
        input=encoded_sequence,
        name='%s_scaling' % name)

    return pooling_layer(
        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)


@wrap_name_default()
def dot_product_attention(encoded_sequence,
                          attended_sequence,
                          transformed_state,
                          softmax_param_attr=None,
                          name=None):
    """
    Calculate and return a context vector with a dot-product attention
    mechanism. The dimension of the context vector equals that of the
    attended_sequence.

    ..  math::

        a(s_{i-1},h_{j}) & = s_{i-1}^\\mathrm{T} h_{j}

        e_{i,j} & = a(s_{i-1}, h_{j})

        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}

        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}

    where :math:`h_{j}` is the jth element of encoded_sequence,
    :math:`z_{j}` is the jth element of attended_sequence, and
    :math:`s_{i-1}` is transformed_state.

    The example usage is:

    ..  code-block:: python

        context = dot_product_attention(encoded_sequence=enc_seq,
                                        attended_sequence=att_seq,
                                        transformed_state=state)

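    Since transformed_state must match the size of encoded_sequence and any
    transformation of the decoder state happens outside this helper, the
    state can be prepared with an ordinary fc_layer; a minimal sketch
    (decoder_prev is assumed to be defined elsewhere):

    ..  code-block:: python

        state = fc_layer(input=decoder_prev,
                         size=enc_seq.size,
                         act=LinearActivation(),
                         bias_attr=False)
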
    :param name: A prefix attached to the name of each layer defined inside
                 dot_product_attention.
    :type name: basestring
    :param softmax_param_attr: The parameter attribute of the sequence softmax
                               that is used to produce the attention weight.
    :type softmax_param_attr: ParameterAttribute
    :param encoded_sequence: The output hidden vectors of the encoder.
    :type encoded_sequence: LayerOutput
    :param attended_sequence: The sequence to be attended. The attention
                              weights computed from encoded_sequence and
                              transformed_state are applied to this sequence
                              to produce the context vector.
    :type attended_sequence: LayerOutput
    :param transformed_state: The transformed hidden state of the decoder in
                              the previous time step. Since a dot-product
                              operation will be performed on it and the
                              encoded_sequence, their dimensions must be
                              equal. For flexibility, we assume that any
                              transformation of the decoder's hidden state has
                              been done outside dot_product_attention and no
                              more will be performed inside. Users can thus
                              pass either the original or a transformed state.
    :type transformed_state: LayerOutput
    :return: The context vector.
    :rtype: LayerOutput
    """
    assert transformed_state.size == encoded_sequence.size

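    # Replicate the transformed decoder state so that a dot product can be
    # taken with the encoder output at every time step.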
    expanded = expand_layer(
        input=transformed_state,
        expand_as=encoded_sequence,
        name='%s_expand' % name)

    m = dot_prod_layer(
        input1=expanded, input2=encoded_sequence, name='%s_dot-product' % name)

    attention_weight = fc_layer(
        input=m,
        size=1,
        act=SequenceSoftmaxActivation(),
        param_attr=softmax_param_attr,
        name="%s_softmax" % name,
        bias_attr=False)

    scaled = scaling_layer(
        weight=attention_weight,
        input=attended_sequence,
        name='%s_scaling' % name)

    return pooling_layer(
        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)


@wrap_name_default()
def multi_head_attention(query,
                         key,
                         value,
                         key_proj_size,
                         value_proj_size,
                         head_num,
                         attention_type,
                         softmax_param_attr=None,
                         name=None):
    """
    Calculate and return a context vector with a multi-head attention
    mechanism, where each head uses either scaled dot-product or additive
    attention. The dimension of the context vector equals
    value_proj_size * head_num.

    Please refer to **Attention Is All You Need** for more details. The link
    is as follows:
    https://arxiv.org/abs/1706.03762.

    The example usage is:

    ..  code-block:: python

        context = multi_head_attention(query=decoder_state,
                                       key=enc_seq,
                                       value=enc_seq,
                                       key_proj_size=64,
                                       value_proj_size=64,
                                       head_num=8,
                                       attention_type='dot-product attention')

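    Additive attention is selected the same way, by passing the other
    supported type string:

    ..  code-block:: python

        context = multi_head_attention(query=decoder_state,
                                       key=enc_seq,
                                       value=enc_seq,
                                       key_proj_size=64,
                                       value_proj_size=64,
                                       head_num=8,
                                       attention_type='additive attention')
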
    :param name: A prefix attached to the name of each layer defined inside
                 multi_head_attention.
    :type name: basestring
    :param softmax_param_attr: The parameter attribute of the sequence softmax
                               that is used to produce the attention weight.
    :type softmax_param_attr: ParameterAttribute
    :param query: query is used to calculate the attention weights over the
                  values at the current step.
    :type query: LayerOutput
    :param key: key is used to calculate the attention weight of the
                corresponding value.
    :type key: LayerOutput
    :param value: value is the sequence to be attended.
    :type value: LayerOutput
    :param key_proj_size: The dimension of the linear projection performed on
                          key and query.
    :type key_proj_size: int
    :param value_proj_size: The dimension of the linear projection performed
                            on value.
    :type value_proj_size: int
    :param head_num: The number of attention heads.
    :type head_num: int
    :param attention_type: The type of the attention mechanism used in each
                           attention head. Currently, only scaled dot-product
                           attention and additive attention are supported.
    :type attention_type: basestring
    :return: The context vector.
    :rtype: LayerOutput
    """
    assert attention_type in ['dot-product attention', 'additive attention']

    with mixed_layer(
            size=key_proj_size * head_num,
            name='%s_query_proj' % name) as query_proj:
        query_proj += full_matrix_projection(query)
    query_proj = expand_layer(input=query_proj, expand_as=key)

    with mixed_layer(
            size=key_proj_size * head_num,
            name='%s_key_proj' % name) as key_proj:
        key_proj += full_matrix_projection(key)

    with mixed_layer(
            size=value_proj_size * head_num,
            name='%s_value_proj' % name) as value_proj:
        value_proj += full_matrix_projection(value)

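    # Each head works on its own key_proj_size (or value_proj_size) slice of
    # the shared projections; the slices are extracted below with offset
    # identity projections.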
    head_list = []
    for i in range(head_num):
        with mixed_layer(size=key_proj_size) as sub_query_proj:
            sub_query_proj += identity_projection(
                query_proj, offset=key_proj_size * i, size=key_proj_size)

        with mixed_layer(size=key_proj_size) as sub_key_proj:
            sub_key_proj += identity_projection(
                key_proj, offset=key_proj_size * i, size=key_proj_size)

        with mixed_layer(size=value_proj_size) as sub_value_proj:
            sub_value_proj += identity_projection(
                value_proj, offset=value_proj_size * i, size=value_proj_size)

        if attention_type == 'dot-product attention':
            m = dot_prod_layer(
                input1=sub_query_proj,
                input2=sub_key_proj,
                name='%s_dot-product_%d' % (name, i))
            m = slope_intercept_layer(
                input=m,
                slope=math.sqrt(1.0 / key_proj_size),
                name='%s_dot-product_scaling_%d' % (name, i))
        else:
            with mixed_layer(
                    size=key_proj_size,
                    act=TanhActivation(),
                    name='%s_combine_%d' % (name, i)) as m:
                m += identity_projection(sub_query_proj)
                m += identity_projection(sub_key_proj)

        attention_weight = fc_layer(
            input=m,
            size=1,
            act=SequenceSoftmaxActivation(),
            param_attr=softmax_param_attr,
            name="%s_softmax_%d" % (name, i),
            bias_attr=False)

        scaled = scaling_layer(
            weight=attention_weight,
            input=sub_value_proj,
            name='%s_scaling_%d' % (name, i))
        head = pooling_layer(
            input=scaled,
            pooling_type=SumPooling(),
            name="%s_pooling_%d" % (name, i))

        head_list.append(head)

    attended = concat_layer(head_list)

    return attended


def inputs(layers, *args):
    """
    Declare the inputs of the network. The order of the inputs should be the
    same as the order of the data provider's return values.

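    The example usage is (layer names are illustrative):

    ..  code-block:: python

        img = data_layer(name='pixel', size=784)
        label = data_layer(name='label', size=10)
        inputs(img, label)
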
    :param layers: Input layers.
    :type layers: list|tuple|LayerOutput
    :return:
    """

    if isinstance(layers, (LayerOutput, basestring)):
        layers = [layers]
    if len(args) != 0:
        layers.extend(args)

    Inputs(*[l.name for l in layers])


def outputs(layers, *args):
    """
    Declare the outputs of the network. If the user has not declared the
    inputs of the network, this method will calculate the input order by DFS
    traversal.

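    The example usage is (cost is assumed to be a cost layer defined earlier
    in the config):

    ..  code-block:: python

        outputs(cost)
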
    :param layers: Output layers.
    :type layers: list|tuple|LayerOutput
    :return:
    """

    traveled = set()

    def __dfs_travel__(layer,
                       predicate=lambda x: x.layer_type == LayerType.DATA):
        """
        DFS post-order traversal starting from an output layer.

        Layers that satisfy the predicate (data layers by default) are
        returned in their definition order.

        :param layer:
        :type layer: LayerOutput
        :return:
        """
        if layer in traveled:
            return []
        else:
            traveled.add(layer)

        assert isinstance(layer, LayerOutput), "layer is %s" % (layer)
        retv = []
        if layer.parents is not None:
            for p in layer.parents:
                retv.extend(__dfs_travel__(p, predicate))

        if predicate(layer):
            retv.append(layer)
        return retv


    if isinstance(layers, LayerOutput):
        layers = [layers]

    if len(args) != 0:
        layers.extend(args)

    assert len(layers) > 0

    if HasInputsSet():  # inputs have already been set
        Outputs(*[l.name for l in layers])
        return  # just return outputs.

    if len(layers) != 1:
        logger.warning("The `outputs` routine will try to calculate the"
                       " network's input and output order. It might not work"
                       " well. Please check the following log carefully.")
    inputs = []
    outputs_ = []
    for each_layer in layers:
        assert isinstance(each_layer, LayerOutput)
        inputs.extend(__dfs_travel__(each_layer))
        outputs_.extend(
            __dfs_travel__(each_layer,
                           lambda x: x.layer_type == LayerType.COST))

    # Now we have each leaf node's input order and output order;
    # merge them together.

    final_inputs = []
    final_outputs = []

    for each_input in inputs:
        assert isinstance(each_input, LayerOutput)
        if each_input.name not in final_inputs:
            final_inputs.append(each_input.name)

    for each_output in outputs_:
        assert isinstance(each_output, LayerOutput)
        if each_output.name not in final_outputs:
            final_outputs.append(each_output.name)

    logger.info("".join(["The input order is [", ", ".join(final_inputs), "]"]))

    if len(final_outputs) == 0:
        final_outputs = map(lambda x: x.name, layers)

    logger.info("".join(
        ["The output order is [", ", ".join(final_outputs), "]"]))

    Inputs(*final_inputs)
    Outputs(*final_outputs)