You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
348 lines
12 KiB
348 lines
12 KiB
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from __future__ import print_function
|
|
import contextlib
|
|
|
|
import numpy as np
|
|
import paddle
|
|
import paddle.fluid as fluid
|
|
import paddle.fluid.framework as framework
|
|
import paddle.fluid.layers as pd
|
|
from paddle.fluid.executor import Executor
|
|
import unittest
|
|
import os
|
|
|
|
dict_size = 30000
|
|
source_dict_dim = target_dict_dim = dict_size
|
|
hidden_dim = 32
|
|
word_dim = 16
|
|
batch_size = 2
|
|
max_length = 8
|
|
topk_size = 50
|
|
trg_dic_size = 10000
|
|
beam_size = 2
|
|
|
|
decoder_size = hidden_dim
|
|
|
|
|
|
def encoder(is_sparse):
|
|
# encoder
|
|
src_word_id = pd.data(
|
|
name="src_word_id", shape=[1], dtype='int64', lod_level=1)
|
|
src_embedding = pd.embedding(
|
|
input=src_word_id,
|
|
size=[dict_size, word_dim],
|
|
dtype='float32',
|
|
is_sparse=is_sparse,
|
|
param_attr=fluid.ParamAttr(name='vemb'))
|
|
|
|
fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
|
|
lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
|
|
encoder_out = pd.sequence_last_step(input=lstm_hidden0)
|
|
return encoder_out
|
|
|
|
|
|
def decoder_train(context, is_sparse):
|
|
# decoder
|
|
trg_language_word = pd.data(
|
|
name="target_language_word", shape=[1], dtype='int64', lod_level=1)
|
|
trg_embedding = pd.embedding(
|
|
input=trg_language_word,
|
|
size=[dict_size, word_dim],
|
|
dtype='float32',
|
|
is_sparse=is_sparse,
|
|
param_attr=fluid.ParamAttr(name='vemb'))
|
|
|
|
rnn = pd.DynamicRNN()
|
|
with rnn.block():
|
|
current_word = rnn.step_input(trg_embedding)
|
|
pre_state = rnn.memory(init=context)
|
|
current_state = pd.fc(input=[current_word, pre_state],
|
|
size=decoder_size,
|
|
act='tanh')
|
|
|
|
current_score = pd.fc(input=current_state,
|
|
size=target_dict_dim,
|
|
act='softmax')
|
|
rnn.update_memory(pre_state, current_state)
|
|
rnn.output(current_score)
|
|
|
|
return rnn()
|
|
|
|
|
|
def decoder_decode(context, is_sparse):
|
|
init_state = context
|
|
array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
|
|
counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
|
|
|
|
# fill the first element with init_state
|
|
state_array = pd.create_array('float32')
|
|
pd.array_write(init_state, array=state_array, i=counter)
|
|
|
|
# ids, scores as memory
|
|
ids_array = pd.create_array('int64')
|
|
scores_array = pd.create_array('float32')
|
|
|
|
init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
|
|
init_scores = pd.data(
|
|
name="init_scores", shape=[1], dtype="float32", lod_level=2)
|
|
|
|
pd.array_write(init_ids, array=ids_array, i=counter)
|
|
pd.array_write(init_scores, array=scores_array, i=counter)
|
|
|
|
cond = pd.less_than(x=counter, y=array_len)
|
|
|
|
while_op = pd.While(cond=cond)
|
|
with while_op.block():
|
|
pre_ids = pd.array_read(array=ids_array, i=counter)
|
|
pre_state = pd.array_read(array=state_array, i=counter)
|
|
pre_score = pd.array_read(array=scores_array, i=counter)
|
|
|
|
# expand the recursive_sequence_lengths of pre_state to be the same with pre_score
|
|
pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
|
|
|
|
pre_ids_emb = pd.embedding(
|
|
input=pre_ids,
|
|
size=[dict_size, word_dim],
|
|
dtype='float32',
|
|
is_sparse=is_sparse)
|
|
|
|
# use rnn unit to update rnn
|
|
current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
|
|
size=decoder_size,
|
|
act='tanh')
|
|
current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
|
|
# use score to do beam search
|
|
current_score = pd.fc(input=current_state_with_lod,
|
|
size=target_dict_dim,
|
|
act='softmax')
|
|
topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
|
|
# calculate accumulated scores after topk to reduce computation cost
|
|
accu_scores = pd.elementwise_add(
|
|
x=pd.log(topk_scores), y=pd.reshape(
|
|
pre_score, shape=[-1]), axis=0)
|
|
selected_ids, selected_scores = pd.beam_search(
|
|
pre_ids,
|
|
pre_score,
|
|
topk_indices,
|
|
accu_scores,
|
|
beam_size,
|
|
end_id=10,
|
|
level=0)
|
|
|
|
pd.increment(x=counter, value=1, in_place=True)
|
|
|
|
# update the memories
|
|
pd.array_write(current_state, array=state_array, i=counter)
|
|
pd.array_write(selected_ids, array=ids_array, i=counter)
|
|
pd.array_write(selected_scores, array=scores_array, i=counter)
|
|
|
|
# update the break condition: up to the max length or all candidates of
|
|
# source sentences have ended.
|
|
length_cond = pd.less_than(x=counter, y=array_len)
|
|
finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
|
|
pd.logical_and(x=length_cond, y=finish_cond, out=cond)
|
|
|
|
translation_ids, translation_scores = pd.beam_search_decode(
|
|
ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)
|
|
|
|
# return init_ids, init_scores
|
|
|
|
return translation_ids, translation_scores
|
|
|
|
|
|
def train_main(use_cuda, is_sparse, is_local=True):
|
|
if use_cuda and not fluid.core.is_compiled_with_cuda():
|
|
return
|
|
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
|
|
|
|
context = encoder(is_sparse)
|
|
rnn_out = decoder_train(context, is_sparse)
|
|
label = pd.data(
|
|
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
|
|
cost = pd.cross_entropy(input=rnn_out, label=label)
|
|
avg_cost = pd.mean(cost)
|
|
|
|
optimizer = fluid.optimizer.Adagrad(
|
|
learning_rate=1e-4,
|
|
regularization=fluid.regularizer.L2DecayRegularizer(
|
|
regularization_coeff=0.1))
|
|
optimizer.minimize(avg_cost)
|
|
|
|
train_data = paddle.batch(
|
|
paddle.reader.shuffle(
|
|
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
|
|
batch_size=batch_size)
|
|
|
|
feed_order = [
|
|
'src_word_id', 'target_language_word', 'target_language_next_word'
|
|
]
|
|
|
|
exe = Executor(place)
|
|
|
|
def train_loop(main_program):
|
|
exe.run(framework.default_startup_program())
|
|
|
|
feed_list = [
|
|
main_program.global_block().var(var_name) for var_name in feed_order
|
|
]
|
|
feeder = fluid.DataFeeder(feed_list, place)
|
|
|
|
batch_id = 0
|
|
for pass_id in range(1):
|
|
for data in train_data():
|
|
outs = exe.run(main_program,
|
|
feed=feeder.feed(data),
|
|
fetch_list=[avg_cost])
|
|
avg_cost_val = np.array(outs[0])
|
|
print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
|
|
" avg_cost=" + str(avg_cost_val))
|
|
if batch_id > 3:
|
|
break
|
|
batch_id += 1
|
|
|
|
if is_local:
|
|
train_loop(framework.default_main_program())
|
|
else:
|
|
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
|
|
pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip...
|
|
eplist = []
|
|
for ip in pserver_ips.split(","):
|
|
eplist.append(':'.join([ip, port]))
|
|
pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
|
|
trainers = int(os.getenv("PADDLE_TRAINERS"))
|
|
current_endpoint = os.getenv("POD_IP") + ":" + port
|
|
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
|
|
training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
|
|
t = fluid.DistributeTranspiler()
|
|
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
|
|
if training_role == "PSERVER":
|
|
pserver_prog = t.get_pserver_program(current_endpoint)
|
|
pserver_startup = t.get_startup_program(current_endpoint,
|
|
pserver_prog)
|
|
exe.run(pserver_startup)
|
|
exe.run(pserver_prog)
|
|
elif training_role == "TRAINER":
|
|
train_loop(t.get_trainer_program())
|
|
|
|
|
|
def decode_main(use_cuda, is_sparse):
|
|
if use_cuda and not fluid.core.is_compiled_with_cuda():
|
|
return
|
|
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
|
|
|
|
context = encoder(is_sparse)
|
|
translation_ids, translation_scores = decoder_decode(context, is_sparse)
|
|
|
|
exe = Executor(place)
|
|
exe.run(framework.default_startup_program())
|
|
|
|
init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
|
|
init_scores_data = np.array(
|
|
[1. for _ in range(batch_size)], dtype='float32')
|
|
init_ids_data = init_ids_data.reshape((batch_size, 1))
|
|
init_scores_data = init_scores_data.reshape((batch_size, 1))
|
|
init_recursive_seq_lens = [1] * batch_size
|
|
init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
|
|
|
|
init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens,
|
|
place)
|
|
init_scores = fluid.create_lod_tensor(init_scores_data,
|
|
init_recursive_seq_lens, place)
|
|
|
|
train_data = paddle.batch(
|
|
paddle.reader.shuffle(
|
|
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
|
|
batch_size=batch_size)
|
|
|
|
feed_order = ['src_word_id']
|
|
feed_list = [
|
|
framework.default_main_program().global_block().var(var_name)
|
|
for var_name in feed_order
|
|
]
|
|
feeder = fluid.DataFeeder(feed_list, place)
|
|
|
|
for data in train_data():
|
|
feed_dict = feeder.feed([[x[0]] for x in data])
|
|
feed_dict['init_ids'] = init_ids
|
|
feed_dict['init_scores'] = init_scores
|
|
|
|
result_ids, result_scores = exe.run(
|
|
framework.default_main_program(),
|
|
feed=feed_dict,
|
|
fetch_list=[translation_ids, translation_scores],
|
|
return_numpy=False)
|
|
print(result_ids.recursive_sequence_lengths())
|
|
break
|
|
|
|
|
|
class TestMachineTranslation(unittest.TestCase):
|
|
pass
|
|
|
|
|
|
@contextlib.contextmanager
|
|
def scope_prog_guard():
|
|
prog = fluid.Program()
|
|
startup_prog = fluid.Program()
|
|
scope = fluid.core.Scope()
|
|
with fluid.scope_guard(scope):
|
|
with fluid.program_guard(prog, startup_prog):
|
|
yield
|
|
|
|
|
|
def inject_test_train(use_cuda, is_sparse):
|
|
f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
|
|
if is_sparse else 'dense')
|
|
|
|
def f(*args):
|
|
with scope_prog_guard():
|
|
train_main(use_cuda, is_sparse)
|
|
|
|
setattr(TestMachineTranslation, f_name, f)
|
|
|
|
|
|
def inject_test_decode(use_cuda, is_sparse, decorator=None):
|
|
f_name = 'test_{0}_{1}_decode'.format('cuda'
|
|
if use_cuda else 'cpu', 'sparse'
|
|
if is_sparse else 'dense')
|
|
|
|
def f(*args):
|
|
with scope_prog_guard():
|
|
decode_main(use_cuda, is_sparse)
|
|
|
|
if decorator is not None:
|
|
f = decorator(f)
|
|
|
|
setattr(TestMachineTranslation, f_name, f)
|
|
|
|
|
|
for _use_cuda_ in (False, True):
|
|
for _is_sparse_ in (False, True):
|
|
inject_test_train(_use_cuda_, _is_sparse_)
|
|
|
|
for _use_cuda_ in (False, True):
|
|
for _is_sparse_ in (False, True):
|
|
|
|
_decorator_ = None
|
|
if _use_cuda_:
|
|
_decorator_ = unittest.skip(
|
|
reason='Beam Search does not support CUDA!')
|
|
|
|
inject_test_decode(
|
|
is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
|
|
|
|
if __name__ == '__main__':
|
|
unittest.main()
|