Merge pull request #4674 from ranqiu92/attention

add config helper for dot-product attention.
Cao Ying committed via GitHub
commit f12f61d5ac

@@ -125,3 +125,8 @@ simple_attention
    :members: simple_attention
    :noindex:

dot_product_attention
---------------------
.. automodule:: paddle.v2.networks
    :members: dot_product_attention
    :noindex:

@@ -26,8 +26,9 @@ __all__ = [
    'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
    "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
    'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
    'simple_attention', 'dot_product_attention', 'simple_gru2',
    'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs',
    'outputs'
]
######################################################
@@ -1361,6 +1362,7 @@ def simple_attention(encoded_sequence,
                                 compute attention weight.
    :type transform_param_attr: ParameterAttribute
    :return: a context vector
    :rtype: LayerOutput
    """
    assert encoded_proj.size == decoder_state.size
    proj_size = encoded_proj.size
@@ -1396,6 +1398,88 @@ def simple_attention(encoded_sequence,
        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)


@wrap_name_default()
def dot_product_attention(encoded_sequence,
                          attended_sequence,
                          transformed_state,
                          softmax_param_attr=None,
                          name=None):
"""
Calculate and return a context vector with dot-product attention mechanism.
The dimension of the context vector equals to that of the attended_sequence.
.. math::
a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j}
e_{i,j} & = a(s_{i-1}, h_{j})
a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
where :math:`h_{j}` is the jth element of encoded_sequence,
:math:`z_{j}` is the jth element of attended_sequence,
:math:`s_{i-1}` is transformed_state.
The example usage is:
.. code-block:: python
context = dot_product_attention(encoded_sequence=enc_seq,
attended_sequence=att_seq,
transformed_state=state,)
:param name: A prefix attached to the name of each layer that defined inside
the dot_product_attention.
:type name: basestring
:param softmax_param_attr: The parameter attribute of sequence softmax
that is used to produce attention weight.
:type softmax_param_attr: ParameterAttribute
:param encoded_sequence: The output hidden vectors of the encoder.
:type encoded_sequence: LayerOutput
:param attended_sequence: The attention weight is computed by a feed forward neural
network which has two inputs : decoder's transformed hidden
state of previous time step and encoder's output.
attended_sequence is the sequence to be attended.
:type attended_sequence: LayerOutput
:param transformed_state: The transformed hidden state of decoder in previous time step.
Since the dot-product operation will be performed on it and the
encoded_sequence, their dimensions must be equal. For flexibility,
we suppose transformations of the decoder's hidden state have been
done outside dot_product_attention and no more will be performed
inside. Then users can use either the original or transformed one.
:type transformed_state: LayerOutput
:return: The context vector.
:rtype: LayerOutput
"""
    assert transformed_state.size == encoded_sequence.size

    expanded = expand_layer(
        input=transformed_state,
        expand_as=encoded_sequence,
        name='%s_expand' % name)

    m = linear_comb_layer(
        weights=expanded, vectors=encoded_sequence, name='%s_dot-product' % name)

    attention_weight = fc_layer(
        input=m,
        size=1,
        act=SequenceSoftmaxActivation(),
        param_attr=softmax_param_attr,
        name="%s_softmax" % name,
        bias_attr=False)

    scaled = scaling_layer(
        weight=attention_weight,
        input=attended_sequence,
        name='%s_scaling' % name)

    return pooling_layer(
        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)


def inputs(layers, *args):
    """
    Declare the inputs of network. The order of input should be as same as
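
For readers who want to try the new helper, below is a minimal, hypothetical usage sketch that is not part of the diff above. The layer names, the 512-d size, and the use of plain data layers as stand-ins for a real encoder output and decoder state are assumptions (in practice the attention would usually sit inside a recurrent_group with the state coming from a memory); only dot_product_attention and its parameter names come from this change.

    # Hypothetical example: wiring dot_product_attention into a paddle.v2 config.
    # Names, sizes, and the stand-in data layers are assumptions for illustration.
    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)

    # Encoder output: a sequence of 512-d hidden vectors, one per source time step.
    enc_seq = paddle.layer.data(
        name='encoded_sequence',
        type=paddle.data_type.dense_vector_sequence(512))

    # Decoder hidden state of the previous time step, already transformed to 512-d
    # so that transformed_state.size == encoded_sequence.size holds.
    state = paddle.layer.data(
        name='transformed_state',
        type=paddle.data_type.dense_vector(512))

    # The helper expands `state` over `enc_seq`, scores each time step, normalizes
    # the scores with a sequence softmax, and returns the weighted sum of
    # attended_sequence (here the encoder outputs themselves).
    context = paddle.networks.dot_product_attention(
        encoded_sequence=enc_seq,
        attended_sequence=enc_seq,
        transformed_state=state)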
