You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paddle/python/paddle/v2/fluid/nets.py

311 lines
10 KiB

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import layers
__all__ = [
"simple_img_conv_pool",
"sequence_conv_pool",
"glu",
"scaled_dot_product_attention",
]
def simple_img_conv_pool(input,
num_filters,
filter_size,
pool_size,
pool_stride,
act,
param_attr=None,
pool_type='max',
use_cudnn=True):
conv_out = layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
param_attr=param_attr,
act=act,
use_cudnn=use_cudnn)
pool_out = layers.pool2d(
input=conv_out,
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
use_cudnn=use_cudnn)
return pool_out
def img_conv_group(input,
conv_num_filter,
pool_size,
conv_padding=1,
conv_filter_size=3,
conv_act=None,
param_attr=None,
conv_with_batchnorm=False,
conv_batchnorm_drop_rate=0.0,
pool_stride=1,
pool_type=None,
use_cudnn=True):
"""
Image Convolution Group, Used for vgg net.
"""
tmp = input
assert isinstance(conv_num_filter, list) or \
isinstance(conv_num_filter, tuple)
def __extend_list__(obj):
if not hasattr(obj, '__len__'):
return [obj] * len(conv_num_filter)
else:
return obj
conv_padding = __extend_list__(conv_padding)
conv_filter_size = __extend_list__(conv_filter_size)
param_attr = __extend_list__(param_attr)
conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
for i in xrange(len(conv_num_filter)):
local_conv_act = conv_act
if conv_with_batchnorm[i]:
local_conv_act = None
tmp = layers.conv2d(
input=tmp,
num_filters=conv_num_filter[i],
filter_size=conv_filter_size[i],
padding=conv_padding[i],
param_attr=param_attr[i],
act=local_conv_act,
use_cudnn=use_cudnn)
if conv_with_batchnorm[i]:
tmp = layers.batch_norm(input=tmp, act=conv_act)
drop_rate = conv_batchnorm_drop_rate[i]
if abs(drop_rate) > 1e-5:
tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
pool_out = layers.pool2d(
input=tmp,
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
use_cudnn=use_cudnn)
return pool_out
def sequence_conv_pool(input,
num_filters,
filter_size,
param_attr=None,
act="sigmoid",
pool_type="max"):
conv_out = layers.sequence_conv(
input=input,
num_filters=num_filters,
filter_size=filter_size,
param_attr=param_attr,
act=act)
pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)
return pool_out
def glu(input, dim=-1):
"""
The gated linear unit composed by split, sigmoid activation and elementwise
multiplication. Specifically, Split the input into two equal sized parts
:math:`a` and :math:`b` along the given dimension and then compute as
following:
.. math::
{GLU}(a, b)= a \otimes \sigma(b)
Refer to `Language Modeling with Gated Convolutional Networks
<https://arxiv.org/pdf/1612.08083.pdf>`_.
Args:
input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int): The dimension along which to split. If :math:`dim < 0`, the
dimension to split along is :math:`rank(input) + dim`.
Returns:
Variable: The Tensor variable with half the size of input.
Examples:
.. code-block:: python
# x is a Tensor variable with shape [3, 6, 9]
fluid.nets.glu(input=x, dim=1) # shape of output: [3, 3, 9]
"""
a, b = layers.split(input, num_or_sections=2, dim=dim)
act_b = layers.sigmoid(x=b)
out = layers.elementwise_mul(x=a, y=act_b)
return out
def scaled_dot_product_attention(queries,
keys,
values,
num_heads=1,
dropout_rate=0.):
"""
The dot-product attention.
Attention mechanism can be seen as mapping a query and a set of key-value
pairs to an output. The output is computed as a weighted sum of the values,
where the weight assigned to each value is computed by a compatibility
function (dot-product here) of the query with the corresponding key.
The dot-product attention can be implemented through (batch) matrix
multipication as follows:
.. math::
Attention(Q, K, V)= softmax(QK^\mathrm{T})V
Refer to `Attention Is All You Need
<https://arxiv.org/pdf/1706.03762.pdf>`_.
Args:
queries (Variable): The input variable which should be a 3-D Tensor.
keys (Variable): The input variable which should be a 3-D Tensor.
values (Variable): The input variable which should be a 3-D Tensor.
num_heads (int): Head number to compute the scaled dot product
attention. Default value is 1.
dropout_rate (float): The dropout rate to drop the attention weight.
Default value is 0.
Returns:
Variable: A 3-D Tensor computed by multi-head scaled dot product
attention.
Examples:
.. code-block:: python
# Suppose q, k, v are Tensors with the following shape:
# q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
contexts = fluid.nets.dot_product_attention(q, k, v)
out.shape # [3, 5, 10]
attn_scores.shape # [3, 5, 6]
"""
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError(
"Inputs quries, keys and values should all be 3-D tensors.")
if queries.shape[-1] != keys.shape[-1]:
raise ValueError(
"The hidden size of queries and keys should be the same.")
if keys.shape[-2] != values.shape[-2]:
raise ValueError(
"The max sequence length in query batch and in key batch "
"should be the same.")
if keys.shape[-1] % num_heads != 0:
raise ValueError("The hidden size of keys (%d) must be divisible "
"by the number of attention heads (%d)." %
(keys.shape[-1], num_heads))
if values.shape[-1] % num_heads != 0:
raise ValueError("The hidden size of values (%d) must be divisible "
"by the number of attention heads (%d)." %
(values.shape[-1], num_heads))
def __compute_qkv(queries, keys, values, num_heads):
if num_heads == 1:
return queries, keys, values
q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
return q, k, v
def __split_heads(x, num_heads):
"""
Reshape the last dimension of inpunt tensor x so that it becomes two
dimensions.
Args:
x(Tensor): a 3-D input Tensor.
num_heads(int): The number of heads.
Returns:
Tensor: a Tensor with shape [..., n, m/num_heads], where m is size
of the last dimension of x.
"""
if num_heads == 1:
return x
hidden_size = x.shape[-1]
# reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
# into a 4-D output:
# [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
reshaped = layers.reshape(
x=x,
shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])
# permuate the dimensions into:
# [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Reshape the last two dimensions of inpunt tensor x so that it becomes
one dimension.
Args:
x(Tensor): a 4-D input Tensor with shape
[bs, num_heads, max_sequence_length, hidden_dim].
Returns:
Tensor: a Tensor with shape
[bs, max_sequence_length, num_heads * hidden_dim].
"""
if len(x.shape) == 3: return
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
return layers.reshape(
x=trans_x,
shape=map(int, [
trans_x.shape[0], trans_x.shape[1],
trans_x.shape[2] * trans_x.shape[3]
]))
q, k, v = __compute_qkv(queries, keys, values, num_heads)
q = __split_heads(q, num_heads)
k = __split_heads(k, num_heads)
v = __split_heads(v, num_heads)
key_dim_per_head = keys.shape[-1] // num_heads
scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
product = layers.matmul(x=k, y=scaled_q, transpose_y=True)
weights = layers.reshape(
x=layers.reshape(
x=product, shape=[-1, product.shape[-1]], act="softmax"),
shape=product.shape)
if dropout_rate:
weights = layers.dropout(x, dropout_prob=dropout_rate, is_test=False)
ctx_multiheads = layers.matmul(weights, v)
return __combine_heads(ctx_multiheads)