You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paddle/python/paddle/fluid/nets.py

346 lines
12 KiB

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import layers
__all__ = [
"simple_img_conv_pool",
"sequence_conv_pool",
"glu",
"scaled_dot_product_attention",
]
def simple_img_conv_pool(input,
num_filters,
filter_size,
pool_size,
pool_stride,
act,
param_attr=None,
pool_type='max',
use_cudnn=True,
use_mkldnn=False):
conv_out = layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
param_attr=param_attr,
act=act,
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
pool_out = layers.pool2d(
input=conv_out,
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
return pool_out
def img_conv_group(input,
conv_num_filter,
pool_size,
conv_padding=1,
conv_filter_size=3,
conv_act=None,
param_attr=None,
conv_with_batchnorm=False,
conv_batchnorm_drop_rate=0.0,
pool_stride=1,
pool_type=None,
use_cudnn=True,
use_mkldnn=False):
"""
Image Convolution Group, Used for vgg net.
"""
tmp = input
assert isinstance(conv_num_filter, list) or \
isinstance(conv_num_filter, tuple)
def __extend_list__(obj):
if not hasattr(obj, '__len__'):
return [obj] * len(conv_num_filter)
else:
return obj
conv_padding = __extend_list__(conv_padding)
conv_filter_size = __extend_list__(conv_filter_size)
param_attr = __extend_list__(param_attr)
conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
for i in xrange(len(conv_num_filter)):
local_conv_act = conv_act
if conv_with_batchnorm[i]:
local_conv_act = None
tmp = layers.conv2d(
input=tmp,
num_filters=conv_num_filter[i],
filter_size=conv_filter_size[i],
padding=conv_padding[i],
param_attr=param_attr[i],
act=local_conv_act,
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
if conv_with_batchnorm[i]:
tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True)
drop_rate = conv_batchnorm_drop_rate[i]
if abs(drop_rate) > 1e-5:
tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
pool_out = layers.pool2d(
input=tmp,
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
return pool_out
def sequence_conv_pool(input,
num_filters,
filter_size,
param_attr=None,
act="sigmoid",
pool_type="max"):
conv_out = layers.sequence_conv(
input=input,
num_filters=num_filters,
filter_size=filter_size,
param_attr=param_attr,
act=act)
pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)
return pool_out
def glu(input, dim=-1):
"""
The gated linear unit composed by split, sigmoid activation and elementwise
multiplication. Specifically, Split the input into two equal sized parts
:math:`a` and :math:`b` along the given dimension and then compute as
following:
.. math::
{GLU}(a, b)= a \otimes \sigma(b)
Refer to `Language Modeling with Gated Convolutional Networks
<https://arxiv.org/pdf/1612.08083.pdf>`_.
Args:
input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int): The dimension along which to split. If :math:`dim < 0`, the
dimension to split along is :math:`rank(input) + dim`.
Returns:
Variable: The Tensor variable with half the size of input.
Examples:
.. code-block:: python
# x is a Tensor variable with shape [3, 6, 9]
fluid.nets.glu(input=x, dim=1) # shape of output: [3, 3, 9]
"""
a, b = layers.split(input, num_or_sections=2, dim=dim)
act_b = layers.sigmoid(x=b)
out = layers.elementwise_mul(x=a, y=act_b)
return out
def scaled_dot_product_attention(queries,
keys,
values,
num_heads=1,
dropout_rate=0.):
"""
The dot-product attention.
Attention mechanism can be seen as mapping a query and a set of key-value
pairs to an output. The output is computed as a weighted sum of the values,
where the weight assigned to each value is computed by a compatibility
function (dot-product here) of the query with the corresponding key.
The dot-product attention can be implemented through (batch) matrix
multipication as follows:
.. math::
Attention(Q, K, V)= softmax(QK^\mathrm{T})V
Refer to `Attention Is All You Need
<https://arxiv.org/pdf/1706.03762.pdf>`_.
Args:
queries (Variable): The input variable which should be a 3-D Tensor.
keys (Variable): The input variable which should be a 3-D Tensor.
values (Variable): The input variable which should be a 3-D Tensor.
num_heads (int): Head number to compute the scaled dot product
attention. Default value is 1.
dropout_rate (float): The dropout rate to drop the attention weight.
Default value is 0.
Returns:
Variable: A 3-D Tensor computed by multi-head scaled dot product \
attention.
Raises:
ValueError: If input queries, keys, values are not 3-D Tensors.
NOTE:
1. When num_heads > 1, three linear projections are learned respectively
to map input queries, keys and values into queries', keys' and values'.
queries', keys' and values' have the same shapes with queries, keys
and values.
1. When num_heads == 1, scaled_dot_product_attention has no learnable
parameters.
Examples:
.. code-block:: python
# Suppose q, k, v are Tensors with the following shape:
# q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
contexts.shape # [3, 5, 10]
"""
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs quries, keys and values should all be 3-D tensors.")
if queries.shape[-1] != keys.shape[-1]:
raise ValueError(
"The hidden size of queries and keys should be the same.")
if keys.shape[-2] != values.shape[-2]:
raise ValueError(
"The max sequence length in query batch and in key batch "
"should be the same.")
if keys.shape[-1] % num_heads != 0:
raise ValueError("The hidden size of keys (%d) must be divisible "
"by the number of attention heads (%d)." %
(keys.shape[-1], num_heads))
if values.shape[-1] % num_heads != 0:
raise ValueError("The hidden size of values (%d) must be divisible "
"by the number of attention heads (%d)." %
(values.shape[-1], num_heads))
def __compute_qkv(queries, keys, values, num_heads):
"""
Add linear projection to queries, keys, and values.
Args:
queries(Tensor): a 3-D input Tensor.
keys(Tensor): a 3-D input Tensor.
values(Tensor): a 3-D input Tensor.
num_heads(int): The number of heads. Linearly project the inputs
ONLY when num_heads > 1.
Returns:
Tensor: linearly projected output Tensors: queries', keys' and
values'. They have the same shapes with queries, keys and
values.
"""
if num_heads == 1:
return queries, keys, values
q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
return q, k, v
def __split_heads(x, num_heads):
"""
Reshape the last dimension of inpunt tensor x so that it becomes two
dimensions.
Args:
x(Tensor): a 3-D input Tensor.
num_heads(int): The number of heads.
Returns:
Tensor: a Tensor with shape [..., n, m/num_heads], where m is size
of the last dimension of x.
"""
if num_heads == 1:
return x
hidden_size = x.shape[-1]
# reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
# into a 4-D output:
# [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
reshaped = layers.reshape(
x=x,
shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])
# permuate the dimensions into:
# [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Reshape the last two dimensions of inpunt tensor x so that it becomes
one dimension.
Args:
x(Tensor): a 4-D input Tensor with shape
[bs, num_heads, max_sequence_length, hidden_dim].
Returns:
Tensor: a Tensor with shape
[bs, max_sequence_length, num_heads * hidden_dim].
"""
if len(x.shape) == 3: return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
return layers.reshape(
x=trans_x,
shape=map(int, [
trans_x.shape[0], trans_x.shape[1],
trans_x.shape[2] * trans_x.shape[3]
]))
q, k, v = __compute_qkv(queries, keys, values, num_heads)
q = __split_heads(q, num_heads)
k = __split_heads(k, num_heads)
v = __split_heads(v, num_heads)
key_dim_per_head = keys.shape[-1] // num_heads
scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
product = layers.matmul(x=k, y=scaled_q, transpose_y=True)
weights = layers.reshape(
x=layers.reshape(
x=product, shape=[-1, product.shape[-1]], act="softmax"),
shape=product.shape)
if dropout_rate:
weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False)
ctx_multiheads = layers.matmul(weights, v)
return __combine_heads(ctx_multiheads)