# Paddle/python/paddle/fluid/tests/unittests/test_transformer_api.py
#
# 615 lines
# 27 KiB

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.nn.layer.transformer import MultiHeadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer
import unittest
def generate_basic_params(mode="attn", self_attention=True):
    """Draw a random set of hyper-parameters for one transformer test case.

    Args:
        mode: One of "attn", "encoder_layer" or "decoder_layer"; selects
            which tuple of parameters is returned.
        self_attention: Only read when ``mode == "attn"``. When true the
            key/value dims and lengths equal the query's; otherwise they
            are drawn independently (key and value share one length).

    Returns:
        * "attn": ``(batch_size, query_length, key_length, value_length,
          embed_dim, kdim, vdim, num_heads, attn_dropout)``
        * "encoder_layer": ``(batch_size, embed_dim, num_heads,
          dim_feedforward, dropout, attn_dropout, act_dropout,
          sequence_length)``
        * "decoder_layer": the "encoder_layer" tuple followed by
          ``target_length``.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values.
    """
    # Fail loudly instead of falling off the end and returning None, which
    # callers would only notice as an opaque tuple-unpacking error.
    if mode not in ("attn", "encoder_layer", "decoder_layer"):
        raise ValueError(
            "mode must be 'attn', 'encoder_layer' or 'decoder_layer', "
            "got {!r}".format(mode))

    batch_size, query_length = [np.random.randint(2, 10) for _ in range(2)]
    d_head, num_heads = [np.random.randint(3, 10) for _ in range(2)]
    attn_dropout = 0.0
    # embed_dim is built as a multiple of num_heads so it splits evenly.
    embed_dim = d_head * num_heads

    if mode == "attn":
        if self_attention:
            kdim, vdim = embed_dim, embed_dim
            key_length, value_length = query_length, query_length
        else:
            kdim, vdim = [np.random.randint(5, 20) for _ in range(2)]
            key_length = np.random.randint(2, 10)
            value_length = key_length
        return (batch_size, query_length, key_length, value_length,
                embed_dim, kdim, vdim, num_heads, attn_dropout)

    dropout, act_dropout = 0.0, 0.0
    dim_feedforward = np.random.randint(128, 1024)
    sequence_length = np.random.randint(2, 10)
    if mode == "encoder_layer":
        return (batch_size, embed_dim, num_heads, dim_feedforward, dropout,
                attn_dropout, act_dropout, sequence_length)

    # mode == "decoder_layer"
    target_length = np.random.randint(2, 10)
    return (batch_size, embed_dim, num_heads, dim_feedforward, dropout,
            attn_dropout, act_dropout, sequence_length, target_length)
def generate_query_key_value_cache(self_attention,
                                   batch_size,
                                   num_heads,
                                   query_length,
                                   embed_dim,
                                   key_length=None,
                                   value_length=None,
                                   kdim=None,
                                   vdim=None,
                                   cache=None):
    """Create random attention inputs, an additive mask and an optional cache.

    Returns:
        ``(query, key, value, attn_mask, cache_dict)``. ``query`` is a
        float32 array of shape (batch_size, query_length, embed_dim). For
        self-attention ``key`` and ``value`` are the same object as
        ``query``; otherwise they are fresh float32 arrays using
        ``kdim`` / ``vdim``. ``attn_mask`` is an all-zero array of shape
        (batch_size, num_heads, query_length, key_length) whose first
        entry is -1e9. ``cache_dict`` is ``None`` when ``cache`` is falsy;
        otherwise it holds per-head arrays under "k"/"v" (self-attention)
        or "static_k"/"static_v" (cross-attention).
    """

    def random_float32(*shape):
        return np.random.rand(*shape).astype("float32")

    query = random_float32(batch_size, query_length, embed_dim)

    attn_mask = np.zeros((batch_size, num_heads, query_length, key_length))
    attn_mask[0][0][0][0] = -1e9

    head_dim = embed_dim // num_heads

    if self_attention:
        key = value = query
    else:
        key = random_float32(batch_size, key_length, kdim)
        value = random_float32(batch_size, value_length, vdim)

    if not cache:
        return query, key, value, attn_mask, None

    if self_attention:
        k_name, v_name = "k", "v"
    else:
        k_name, v_name = "static_k", "static_v"
    cache_dict = {
        k_name: random_float32(batch_size, num_heads, key_length, head_dim),
        v_name: random_float32(batch_size, num_heads, value_length,
                               head_dim),
    }
    return query, key, value, attn_mask, cache_dict
def fc(x, weight):
    """Bias-free fully connected layer: the matrix product of x and weight."""
    projected = np.matmul(x, weight)
    return projected
def softmax(x):
    """Numerically stable softmax over the last axis of ``x``.

    Args:
        x: Array-like with at least one dimension; the callers in this
            file pass 4-D attention scores.

    Returns:
        A float64 array of the same shape as ``x`` whose entries along the
        last axis sum to 1.
    """
    x = np.asarray(x)
    # A row made entirely of -inf gives inf - inf = nan. Silence that
    # warning only for this computation instead of changing NumPy's
    # process-wide error state with np.seterr.
    with np.errstate(invalid='ignore'):
        # Subtract the row maximum first so np.exp cannot overflow.
        e_x = np.exp(x - np.amax(x, axis=-1, keepdims=True))
        output = e_x / np.sum(e_x, axis=-1, keepdims=True)
    return output.astype(np.float64)
def batch_matmul(x, y):
    """Multiply the trailing 2-D matrices of two 4-D arrays slice by slice.

    The first two dimensions of ``x`` and ``y`` must match. The result is a
    float64 array of shape (x.shape[0], x.shape[1], x.shape[2], y.shape[3]).
    """
    assert x.shape[0] == y.shape[0]
    assert x.shape[1] == y.shape[1]
    n_batch, n_head = x.shape[0], x.shape[1]
    product = np.zeros(
        (n_batch, n_head, x.shape[2], y.shape[3]), dtype=np.float64)
    for b, h in np.ndindex(n_batch, n_head):
        product[b, h, :, :] = np.matmul(x[b, h, :, :], y[b, h, :, :])
    return product
def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn):
    """NumPy reference for scaled dot-product attention with merged heads.

    ``q``, ``k`` and ``v`` are indexed as 4-D arrays whose last two axes are
    (length, head_dim). ``attn_mask`` is added to the scores when it is not
    None. ``multi_head_attn`` is accepted but not used. The result has the
    head axis folded into the last axis, i.e. three dimensions.
    """
    scale = np.sqrt(d_key, dtype=np.float64)
    k_transposed = k.transpose([0, 1, 3, 2])
    scores = batch_matmul(q, k_transposed / scale)
    if attn_mask is not None:
        scores += attn_mask
    probs = softmax(scores)
    # Move the head axis next to head_dim, then merge the two.
    context = batch_matmul(probs, v).transpose((0, 2, 1, 3))
    n_batch, length, n_head, head_dim = context.shape
    return context.reshape((n_batch, length, n_head * head_dim))
def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn):
    """Project key and value with the layer's weights and split into heads.

    Uses ``multi_head_attn.k_proj.weight`` and ``multi_head_attn.v_proj.weight``
    (no bias is applied here). Each returned array is reshaped to
    (dim0, dim1, num_heads, embed_dim // num_heads) and then transposed
    with axes (0, 2, 1, 3).
    """

    def split_heads(projected, head_dim):
        n_batch, length = projected.shape[0], projected.shape[1]
        per_head = projected.reshape((n_batch, length, num_heads, head_dim))
        return per_head.transpose((0, 2, 1, 3))

    with fluid.dygraph.guard():
        head_dim = embed_dim // num_heads
        k_weight = multi_head_attn.k_proj.weight.numpy()
        v_weight = multi_head_attn.v_proj.weight.numpy()
        k = split_heads(fc(key, k_weight), head_dim)
        v = split_heads(fc(value, v_weight), head_dim)
        return k, v
def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention,
                multi_head_attn, cache_dict):
    """Build the per-head q, k, v arrays for the NumPy reference attention.

    ``q`` is always projected with ``multi_head_attn.q_proj.weight``. For
    cross-attention with a non-empty ``cache_dict`` the cached
    "static_k"/"static_v" arrays are returned as k and v. Otherwise k and v
    are computed by ``cal_qkv`` and, when ``cache_dict`` is not None, appended
    after the cached "k"/"v" along axis 2.

    Returns:
        ``(q, k, v, cache_dict)``; ``cache_dict`` is passed through as given.
    """
    q = fc(query, multi_head_attn.q_proj.weight.numpy())
    n_batch, length = q.shape[0], q.shape[1]
    head_dim = embed_dim // num_heads
    q = q.reshape((n_batch, length, num_heads, head_dim))
    q = q.transpose((0, 2, 1, 3))

    # Cross-attention with a cache reuses the precomputed static k/v as-is.
    if not self_attention and cache_dict:
        return (q, cache_dict["static_k"], cache_dict["static_v"],
                cache_dict)

    k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn)
    if cache_dict is not None:
        k = np.concatenate((cache_dict["k"], k), axis=2)
        v = np.concatenate((cache_dict["v"], v), axis=2)
    return (q, k, v, cache_dict)
def add(x, y=None):
    """Return ``x`` as an ndarray, with ``y`` added when ``y`` is given.

    Args:
        x: An ``np.ndarray``, or an object with a ``.numpy()`` method
            (presumably a paddle tensor; confirm against callers).
        y: Optional addend.

    Returns:
        ``x`` itself (after ``.numpy()`` conversion if needed) when ``y`` is
        None; otherwise a new array holding ``x + y`` with ``x``'s dtype.
    """
    fluid.enable_dygraph()
    with fluid.dygraph.guard():
        x = x.numpy() if not isinstance(x, np.ndarray) else x
        if y is None:
            return x
        # Accumulate into a private copy: a bare `x += y` would silently
        # overwrite an ndarray that the caller passed in.
        total = x.copy()
        total += y
        return total
def relu(x):
    """Zero every entry of ``x`` that is not strictly positive.

    Implemented as a multiplication by the boolean mask ``x > 0``.
    """
    return x * (x > 0)
def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None):
    """NumPy reference for layer normalisation over the last axis of ``x``.

    Args:
        x: 3-D array; it is unpacked as (batch_size, src_len, d_model).
        normalized_shape: Accepted but not used.
        norm: Object providing ``weight`` (scale) and ``bias`` (shift), each
            with a ``.numpy()`` method.
        epsilon: Added to the variance before taking the square root.
        act: Accepted but not used.

    Returns:
        The normalised, scaled and shifted array in the shape of ``x``.
    """
    fluid.enable_dygraph()
    with fluid.dygraph.guard():
        scale = norm.weight.numpy()
        shift = norm.bias.numpy()

        batch_size, src_len, d_model = x.shape
        rows = x.reshape((batch_size * src_len, d_model))

        mean = np.mean(rows, axis=1, keepdims=True)
        variance = np.sum(np.square(rows - mean), axis=1) / d_model
        std = np.sqrt(variance + epsilon)
        # Make std a column so it divides each row element-wise.
        std = std.reshape((std.shape[0], 1))

        normalized = (rows - mean) / std
        result = scale * normalized + shift
        return result.reshape((batch_size, src_len, d_model))
def ffn(src, encoder_layer, ffn_fc1_act="relu"):
    """NumPy reference for the feed-forward sub-layer (no biases applied).

    Computes ``fc(relu(fc(src, w1)), w2)`` where w1 and w2 are the weights of
    ``encoder_layer.linear1`` and ``encoder_layer.linear2``. ``src`` may be an
    ndarray or an object with a ``.numpy()`` method. Only "relu" is accepted
    for ``ffn_fc1_act``.
    """
    assert ffn_fc1_act == "relu", "only relu is supported"
    fluid.enable_dygraph()
    with fluid.dygraph.guard():
        if not isinstance(src, np.ndarray):
            src = src.numpy()
        w1 = encoder_layer.linear1.weight.numpy()
        w2 = encoder_layer.linear2.weight.numpy()
        # First projection followed by the activation.
        hidden = relu(fc(src, w1))
        # Second projection.
        return fc(hidden, w2)
class TestTransformer(unittest.TestCase):
    """Exercise paddle's transformer layers in dygraph mode on CPU.

    The attention and single-layer tests compare paddle's output against the
    NumPy helpers defined in this file. The stacked encoder/decoder and full
    ``Transformer`` tests only run a forward pass and make no assertion about
    the result.
    """

    @staticmethod
    def _make_mask(shape, masked_value):
        """Return a float32 all-zero 4-D mask with entry [0][0][0][0] set to ``masked_value``."""
        mask = np.zeros(shape).astype("float32")
        mask[0][0][0][0] = masked_value
        return mask

    def _run_transformer_forward(self, **attr_kwargs):
        """Build a ``Transformer`` with random sizes and run one forward pass.

        ``attr_kwargs`` is forwarded to the ``Transformer`` constructor so the
        calling tests can vary ``weight_attr`` / ``bias_attr``.
        """
        (batch_size, d_model, n_head, dim_feedforward, dropout, _, _,
         source_length, target_length) = generate_basic_params(
             mode="decoder_layer")

        with fluid.dygraph.guard(fluid.CPUPlace()):
            transformer = Transformer(
                d_model,
                n_head,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                **attr_kwargs)
            src = paddle.to_tensor(
                np.random.rand(batch_size, source_length, d_model).astype(
                    "float32"))
            tgt = paddle.to_tensor(
                np.random.rand(batch_size, target_length, d_model).astype(
                    "float32"))
            src_mask = paddle.to_tensor(
                self._make_mask((batch_size, n_head, source_length,
                                 source_length), -np.inf))
            tgt_mask = paddle.to_tensor(
                self._make_mask((batch_size, n_head, target_length,
                                 target_length), -1e9))
            memory_mask = paddle.to_tensor(
                self._make_mask((batch_size, n_head, target_length,
                                 source_length), -1e9))
            transformer(src, tgt, src_mask, tgt_mask, memory_mask)

    def test_multi_head_attention(self):
        """Compare MultiHeadAttention with the NumPy reference for every
        combination of self/cross attention and cache/no cache."""

        def multihead_attention_test_helper(self_attention, cache):
            paddle.manual_seed(2020)
            paddle.framework.random._manual_program_seed(2020)
            # self_attention|cross_attention, cache|No cache
            with fluid.dygraph.guard(fluid.CPUPlace()):
                # generate params for multi_head_attention
                (batch_size, query_length, key_length, value_length,
                 embed_dim, kdim, vdim, num_heads,
                 attn_dropout) = generate_basic_params("attn", self_attention)

                (query, key, value, attn_mask,
                 cache_dict) = generate_query_key_value_cache(
                     self_attention, batch_size, num_heads, query_length,
                     embed_dim, key_length, value_length, kdim, vdim, cache)
                if cache and self_attention:
                    # Cached keys are concatenated with the new ones, so the
                    # mask's key axis is doubled to match.
                    attn_mask = np.concatenate((attn_mask, attn_mask), axis=3)

                need_weight, param_attr, bias_attr = False, None, None
                # call paddle's function
                multi_head_attn = MultiHeadAttention(
                    embed_dim, num_heads, attn_dropout, kdim, vdim,
                    need_weight, param_attr, bias_attr)

                # construct cache object
                cache_obj = None
                if cache_dict:
                    # Both keys must be tested explicitly: the expression
                    # `'k' and 'v' in d` only checks the second key.
                    if 'k' in cache_dict and 'v' in cache_dict:
                        cache_obj = multi_head_attn.Cache(
                            paddle.to_tensor(cache_dict['k']),
                            paddle.to_tensor(cache_dict['v']))
                    elif 'static_k' in cache_dict and 'static_v' in cache_dict:
                        cache_obj = multi_head_attn.StaticCache(
                            paddle.to_tensor(cache_dict['static_k']),
                            paddle.to_tensor(cache_dict['static_v']))

                mask_arg = (paddle.to_tensor(attn_mask)
                            if attn_mask is not None else attn_mask)
                attn_output = multi_head_attn(
                    paddle.to_tensor(query),
                    paddle.to_tensor(key),
                    paddle.to_tensor(value), mask_arg, cache_obj)
                # With a cache the layer's result is indexed to get the
                # attention output.
                attn_output = attn_output[0] if cache_dict else attn_output

                # implementation by numpy
                # compute q, k, v
                q, k, v, _ = prepare_qkv(query, key, value, num_heads,
                                         embed_dim, self_attention,
                                         multi_head_attn, cache_dict)
                # scale dot product attention
                attn_heads = scaled_dot_product_attention(
                    q, k, v, embed_dim // num_heads, attn_mask,
                    multi_head_attn)
                out_proj_weight = multi_head_attn.out_proj.weight.numpy()
                reference = fc(attn_heads, out_proj_weight)

                np.testing.assert_allclose(
                    attn_output.numpy(), reference, atol=1e-6)

        multihead_attention_test_helper(True, True)
        multihead_attention_test_helper(True, False)
        multihead_attention_test_helper(False, True)
        multihead_attention_test_helper(False, False)

    def test_transformer_encoder_layer(self):
        """Compare TransformerEncoderLayer with a reference built from a
        separate MultiHeadAttention plus the NumPy layer_norm/ffn helpers."""
        with fluid.dygraph.guard(fluid.CPUPlace()):
            paddle.framework.manual_seed(2020)
            paddle.framework.random._manual_program_seed(2020)

            ffn_fc1_act = "relu"
            # 1. generate basic params
            (batch_size, d_model, n_head, dim_feedforward, dropout,
             attn_dropout, act_dropout,
             sequence_length) = generate_basic_params(mode="encoder_layer")
            # 2. generate input for encoder
            src = np.random.rand(batch_size, sequence_length,
                                 d_model).astype("float32")
            residual = src
            src_mask = self._make_mask(
                (batch_size, n_head, sequence_length, sequence_length),
                -np.inf)
            # 3. paddle
            encoder_layer = TransformerEncoderLayer(
                d_model, n_head, dim_feedforward, dropout, ffn_fc1_act,
                attn_dropout, act_dropout)

            encoder_output = encoder_layer(
                paddle.to_tensor(src), paddle.to_tensor(src_mask))
            # 4. reference: paddle self attention, then numpy post-processing
            self_attn = MultiHeadAttention(
                d_model, n_head, dropout=attn_dropout)
            attn_output = self_attn(
                paddle.to_tensor(src),
                paddle.to_tensor(src),
                paddle.to_tensor(src), paddle.to_tensor(src_mask)).numpy()

            src = attn_output + residual
            src_norm = layer_norm(src, d_model, encoder_layer.norm1)
            residual = src_norm

            ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act)
            src = residual + ffn_output
            src = layer_norm(src, d_model, encoder_layer.norm2)

            np.testing.assert_allclose(
                encoder_output.numpy(), src, rtol=1e-5, atol=1e-6)

    def test_transformer_decoder_layer(self):
        """Compare TransformerDecoderLayer, with and without cache, against a
        reference built from separate attention layers and NumPy helpers."""
        with fluid.dygraph.guard(fluid.CPUPlace()):
            paddle.framework.manual_seed(2020)
            activation = "relu"
            normalize_before = False
            (batch_size, d_model, n_head, dim_feedforward, dropout,
             attn_dropout, act_dropout, source_length,
             target_length) = generate_basic_params(mode="decoder_layer")
            tgt = np.random.rand(batch_size, target_length,
                                 d_model).astype("float32")
            memory = np.random.rand(batch_size, source_length,
                                    d_model).astype("float32")
            tgt_mask = self._make_mask(
                (batch_size, n_head, target_length, target_length), -1e9)
            memory_mask = self._make_mask(
                (batch_size, n_head, target_length, source_length), -1e9)
            for cache in [True, False]:
                self_attn = MultiHeadAttention(
                    d_model, n_head, dropout=attn_dropout)
                cross_attn = MultiHeadAttention(
                    d_model, n_head, dropout=attn_dropout)

                # paddle decoderlayer:
                decoder_layer = TransformerDecoderLayer(
                    d_model, n_head, dim_feedforward, dropout, activation,
                    attn_dropout, act_dropout, normalize_before)
                cache_objs = None
                if cache:
                    cache_objs = decoder_layer.gen_cache(
                        paddle.to_tensor(memory))

                decoder_output = decoder_layer(
                    paddle.to_tensor(tgt),
                    paddle.to_tensor(memory),
                    paddle.to_tensor(tgt_mask),
                    paddle.to_tensor(memory_mask), cache_objs)

                decoder_output = decoder_output[0].numpy(
                ) if cache else decoder_output.numpy()

                # numpy:
                residual = tgt
                # self-attn
                self_attn_cache = cache_objs[
                    0] if cache_objs is not None else None
                # NOTE: tgt is rebound here, so the second loop iteration
                # starts from the first iteration's intermediate value.
                tgt = self_attn(
                    paddle.to_tensor(tgt),
                    paddle.to_tensor(tgt),
                    paddle.to_tensor(tgt),
                    paddle.to_tensor(tgt_mask), self_attn_cache)

                tgt = tgt[0].numpy() if cache else tgt.numpy()

                tgt = residual + tgt
                # postprocess
                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1)
                residual = tgt_norm
                # cross-attn
                cross_attn_cache = cache_objs[
                    1] if cache_objs is not None else None
                tgt = cross_attn(
                    paddle.to_tensor(tgt_norm),
                    paddle.to_tensor(memory),
                    paddle.to_tensor(memory),
                    paddle.to_tensor(memory_mask), cross_attn_cache)
                tgt = tgt[0].numpy() if cache else tgt.numpy()

                # postprocess
                tgt = tgt + residual
                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2)
                residual = tgt_norm
                # FFN
                ffn_output = ffn(tgt_norm, decoder_layer, activation)
                # post process
                tgt = residual + ffn_output
                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3)

                np.testing.assert_allclose(
                    decoder_output, tgt_norm, rtol=1e-5, atol=1e-6)

    def test_encoder(self):
        """Run a forward pass through a 6-layer TransformerEncoder."""
        (batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout,
         act_dropout, sequence_length) = generate_basic_params(
             mode="encoder_layer")

        src = np.random.rand(batch_size, sequence_length,
                             d_model).astype("float32")
        src_mask = self._make_mask(
            (batch_size, n_head, sequence_length, sequence_length), -np.inf)
        with fluid.dygraph.guard(fluid.CPUPlace()):
            encoder_layer = TransformerEncoderLayer(d_model, n_head,
                                                    dim_feedforward, dropout)
            num_layers = 6
            encoder = TransformerEncoder(encoder_layer, num_layers)
            # src, src_mask
            encoder(paddle.to_tensor(src), paddle.to_tensor(src_mask))

    def test_decoder(self):
        """Run a forward pass through a 6-layer TransformerDecoder."""
        (batch_size, d_model, n_head, dim_feedforward, dropout, _, _,
         source_length, target_length) = generate_basic_params(
             mode="decoder_layer")
        tgt = np.random.rand(batch_size, target_length,
                             d_model).astype("float32")
        memory = np.random.rand(batch_size, source_length,
                                d_model).astype("float32")
        tgt_mask = self._make_mask(
            (batch_size, n_head, target_length, target_length), -1e9)
        memory_mask = self._make_mask(
            (batch_size, n_head, target_length, source_length), -1e9)
        with fluid.dygraph.guard(fluid.CPUPlace()):
            decoder_layer = TransformerDecoderLayer(d_model, n_head,
                                                    dim_feedforward, dropout)
            num_layers = 6
            decoder = TransformerDecoder(decoder_layer, num_layers)

            decoder(
                paddle.to_tensor(tgt),
                paddle.to_tensor(memory),
                paddle.to_tensor(tgt_mask), paddle.to_tensor(memory_mask))

    def test_transformer(self):
        """Forward pass through Transformer with default attributes."""
        self._run_transformer_forward()

    def test_transformer_attr_1(self):
        """Forward pass with one-element weight_attr / bias_attr lists."""
        self._run_transformer_forward(weight_attr=[None], bias_attr=[False])

    def test_transformer_attr_2(self):
        """Forward pass with two-element weight_attr / bias_attr lists."""
        self._run_transformer_forward(
            weight_attr=[None, None], bias_attr=[False, False])

    def test_transformer_attr_3(self):
        """Forward pass with three-element weight_attr / bias_attr lists."""
        self._run_transformer_forward(
            weight_attr=[None, None, None], bias_attr=[False, False, True])

    def test_transformer_attr_boolean(self):
        """Forward pass with bias_attr given as a single boolean."""
        self._run_transformer_forward(bias_attr=False)
# Run the unittest test runner when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()