Merge pull request #4674 from ranqiu92/attention

add config helper for dot-product attention.
Cao Ying committed via GitHub
commit f12f61d5ac

@@ -125,3 +125,8 @@ simple_attention
    :members: simple_attention
    :noindex:

dot_product_attention
---------------------
.. automodule:: paddle.v2.networks
    :members: dot_product_attention
    :noindex:

@@ -26,8 +26,9 @@ __all__ = [
    'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
    "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
    'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
    'simple_attention', 'dot_product_attention', 'simple_gru2',
    'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs',
    'outputs'
]
######################################################
@@ -1361,6 +1362,7 @@ def simple_attention(encoded_sequence,
                                 compute attention weight.
    :type transform_param_attr: ParameterAttribute
    :return: a context vector
    :rtype: LayerOutput
    """
    assert encoded_proj.size == decoder_state.size
    proj_size = encoded_proj.size
@@ -1396,6 +1398,88 @@ def simple_attention(encoded_sequence,
        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)


@wrap_name_default()
def dot_product_attention(encoded_sequence,
                          attended_sequence,
                          transformed_state,
                          softmax_param_attr=None,
                          name=None):
"""
Calculate and return a context vector with dot-product attention mechanism.
The dimension of the context vector equals to that of the attended_sequence.
.. math::
a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j}
e_{i,j} & = a(s_{i-1}, h_{j})
a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
where :math:`h_{j}` is the jth element of encoded_sequence,
:math:`z_{j}` is the jth element of attended_sequence,
:math:`s_{i-1}` is transformed_state.
The example usage is:
.. code-block:: python
context = dot_product_attention(encoded_sequence=enc_seq,
attended_sequence=att_seq,
transformed_state=state,)
:param name: A prefix attached to the name of each layer that defined inside
the dot_product_attention.
:type name: basestring
:param softmax_param_attr: The parameter attribute of sequence softmax
that is used to produce attention weight.
:type softmax_param_attr: ParameterAttribute
:param encoded_sequence: The output hidden vectors of the encoder.
:type encoded_sequence: LayerOutput
:param attended_sequence: The attention weight is computed by a feed forward neural
network which has two inputs : decoder's transformed hidden
state of previous time step and encoder's output.
attended_sequence is the sequence to be attended.
:type attended_sequence: LayerOutput
:param transformed_state: The transformed hidden state of decoder in previous time step.
Since the dot-product operation will be performed on it and the
encoded_sequence, their dimensions must be equal. For flexibility,
we suppose transformations of the decoder's hidden state have been
done outside dot_product_attention and no more will be performed
inside. Then users can use either the original or transformed one.
:type transformed_state: LayerOutput
:return: The context vector.
:rtype: LayerOutput
"""
    assert transformed_state.size == encoded_sequence.size

    expanded = expand_layer(
        input=transformed_state,
        expand_as=encoded_sequence,
        name='%s_expand' % name)

    m = linear_comb_layer(
        weights=expanded, vectors=encoded_sequence, name='%s_dot-product' % name)

    attention_weight = fc_layer(
        input=m,
        size=1,
        act=SequenceSoftmaxActivation(),
        param_attr=softmax_param_attr,
        name="%s_softmax" % name,
        bias_attr=False)

    scaled = scaling_layer(
        weight=attention_weight,
        input=attended_sequence,
        name='%s_scaling' % name)

    return pooling_layer(
        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)


def inputs(layers, *args):
    """
    Declare the inputs of network. The order of input should be as same as
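
For readers who want to try the new helper, below is a minimal, hypothetical usage sketch that is not part of the diff above. The layer names, the 512-d size, and the use of plain data layers as stand-ins for a real encoder output and decoder state are assumptions (in practice the attention would usually sit inside a recurrent_group with the state coming from a memory); only dot_product_attention and its parameter names come from this change.

    # Hypothetical example: wiring dot_product_attention into a paddle.v2 config.
    # Names, sizes, and the stand-in data layers are assumptions for illustration.
    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)

    # Encoder output: a sequence of 512-d hidden vectors, one per source time step.
    enc_seq = paddle.layer.data(
        name='encoded_sequence',
        type=paddle.data_type.dense_vector_sequence(512))

    # Decoder hidden state of the previous time step, already transformed to 512-d
    # so that transformed_state.size == encoded_sequence.size holds.
    state = paddle.layer.data(
        name='transformed_state',
        type=paddle.data_type.dense_vector(512))

    # The helper expands `state` over `enc_seq`, scores each time step, normalizes
    # the scores with a sequence softmax, and returns the weighted sum of
    # attended_sequence (here the encoder outputs themselves).
    context = paddle.networks.dot_product_attention(
        encoded_sequence=enc_seq,
        attended_sequence=enc_seq,
        transformed_state=state)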
