Merge pull request #7425 from helinwang/distributed_label_semantic_role
Add distributed label semantic role book chapter
commit
bfc68f256a
@@ -0,0 +1,225 @@
import math

import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.fluid as fluid
import time
import os
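
# This example trains the label semantic roles (SRL) model from the
# PaddlePaddle book on the CoNLL-2005 dataset in a distributed setting:
# fluid.DistributeTranspiler below splits the single-node program into a
# parameter-server program and a trainer program, and environment variables
# select which one this process runs.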

word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_len = len(verb_dict)

mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 1e-3

IS_SPARSE = True
PASS_NUM = 10
BATCH_SIZE = 20

embedding_name = 'emb'
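

# load_parameter reads a pre-trained parameter matrix of shape (h, w) from a
# PaddlePaddle parameter file, skipping the 16-byte binary header.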
def load_parameter(file_name, h, w):
    with open(file_name, 'rb') as f:
        f.read(16)  # skip header.
        return np.fromfile(f, dtype=np.float32).reshape(h, w)
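

# db_lstm builds the deep bidirectional LSTM used for SRL: the eight input
# features (the word, the predicate, five context words around the predicate,
# and the predicate mark) are embedded, projected and summed into a shared
# hidden layer, then fed through a stack of LSTM layers whose direction
# alternates at each depth, with direct connections between consecutive
# layers. It returns unnormalized per-token scores over the label set.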
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
            **ignored):
    # 8 features
    predicate_embedding = fluid.layers.embedding(
        input=predicate,
        size=[pred_len, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr='vemb')

    mark_embedding = fluid.layers.embedding(
        input=mark,
        size=[mark_dict_len, mark_dim],
        dtype='float32',
        is_sparse=IS_SPARSE)

    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
    emb_layers = [
        fluid.layers.embedding(
            size=[word_dict_len, word_dim],
            input=x,
            param_attr=fluid.ParamAttr(
                name=embedding_name, trainable=False)) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)

    hidden_0_layers = [
        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
    ]

    hidden_0 = fluid.layers.sums(input=hidden_0_layers)

    lstm_0 = fluid.layers.dynamic_lstm(
        input=hidden_0,
        size=hidden_dim,
        candidate_activation='relu',
        gate_activation='sigmoid',
        cell_activation='sigmoid')

    # stack L-LSTM and R-LSTM with direct edges
    input_tmp = [hidden_0, lstm_0]

    for i in range(1, depth):
        mix_hidden = fluid.layers.sums(input=[
            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
        ])

        lstm = fluid.layers.dynamic_lstm(
            input=mix_hidden,
            size=hidden_dim,
            candidate_activation='relu',
            gate_activation='sigmoid',
            cell_activation='sigmoid',
            is_reverse=((i % 2) == 1))

        input_tmp = [mix_hidden, lstm]

    feature_out = fluid.layers.sums(input=[
        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
    ])

    return feature_out
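

# to_lodtensor packs a Python list of variable-length integer sequences into
# a single fluid.LoDTensor: the sequences are concatenated into one column
# vector, and the LoD (level-of-detail) index records where each sequence
# starts and ends.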
def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
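

# main builds the SRL network, transpiles it for distributed training, and
# then runs either the parameter-server program or the trainer program,
# depending on the TRAINING_ROLE environment variable.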
def main():
    # define network topology
    word = fluid.layers.data(
        name='word_data', shape=[1], dtype='int64', lod_level=1)
    predicate = fluid.layers.data(
        name='verb_data', shape=[1], dtype='int64', lod_level=1)
    ctx_n2 = fluid.layers.data(
        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
    ctx_n1 = fluid.layers.data(
        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
    ctx_0 = fluid.layers.data(
        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
    ctx_p1 = fluid.layers.data(
        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
    ctx_p2 = fluid.layers.data(
        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
    mark = fluid.layers.data(
        name='mark_data', shape=[1], dtype='int64', lod_level=1)
    feature_out = db_lstm(**locals())
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)
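
    # A linear-chain CRF layer scores whole label sequences on top of the
    # per-token scores from db_lstm; crf_decoding below reuses the same
    # 'crfw' transition parameters to produce the predicted label sequence.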
    crf_cost = fluid.layers.linear_chain_crf(
        input=feature_out,
        label=target,
        param_attr=fluid.ParamAttr(
            name='crfw', learning_rate=mix_hidden_lr))
    avg_cost = fluid.layers.mean(x=crf_cost)

    # TODO(qiao)
    # check other optimizers and check why the output becomes NaN
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)

    # TODO(qiao)
    # add dependency tracking and move this config before the optimizer
    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
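
    # Under the IOB scheme each chunk type contributes a B- and an I- tag,
    # plus one shared O tag, hence (label_dict_len - 1) / 2 chunk types.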
    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
        input=crf_decode,
        label=target,
        chunk_scheme="IOB",
        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))

    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.conll05.test(), buf_size=8192),
        batch_size=BATCH_SIZE)
    place = fluid.CPUPlace()
    feeder = fluid.DataFeeder(
        feed_list=[
            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
        ],
        place=place)
    exe = fluid.Executor(place)
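
    # DistributeTranspiler rewrites the program for distributed training.
    # PSERVERS lists the parameter-server endpoints (host:port, typically
    # comma-separated when there are several), SERVER_ENDPOINT names this
    # node's own endpoint when it runs as a parameter server, and
    # TRAINING_ROLE chooses between the PSERVER and TRAINER branches below.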
    t = fluid.DistributeTranspiler()
    pserver_endpoints = os.getenv("PSERVERS")
    # server endpoint for current node
    current_endpoint = os.getenv("SERVER_ENDPOINT")
    # run as trainer or parameter server
    training_role = os.getenv(
        "TRAINING_ROLE", "TRAINER")  # get the training role: trainer/pserver
    t.transpile(
        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)

    if training_role == "PSERVER":
        if not current_endpoint:
            print("need env SERVER_ENDPOINT")
            exit(1)
        pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
        exe.run(fluid.default_startup_program())
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
        trainer_prog = t.get_trainer_program()
        start_time = time.time()
        batch_id = 0
        exe.run(fluid.default_startup_program())
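        # Copy the pre-trained word embedding table into the 'emb' parameter;
        # it is created with trainable=False above, so it stays fixed during
        # training.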
        embedding_param = fluid.global_scope().find_var(
            embedding_name).get_tensor()
        embedding_param.set(
            load_parameter(conll05.get_embedding(), word_dict_len, word_dim),
            place)
        for pass_id in xrange(PASS_NUM):
            chunk_evaluator.reset(exe)
            for data in train_data():
                cost, precision, recall, f1_score = exe.run(
                    trainer_prog,
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost] + chunk_evaluator.metrics)
                pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
                    exe)

                if batch_id % 10 == 0:
                    print("avg_cost:" + str(cost) + " precision:" + str(
                        precision) + " recall:" + str(recall) + " f1_score:" +
                          str(f1_score) + " pass_precision:" + str(
                              pass_precision) + " pass_recall:" + str(
                                  pass_recall) + " pass_f1_score:" + str(
                                      pass_f1_score))
                    if batch_id != 0:
                        print("seconds per batch: " + str((time.time(
                        ) - start_time) / batch_id))

                batch_id = batch_id + 1
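

# A minimal launch sketch (script name and endpoints are assumptions, not
# part of this chapter): start one parameter server and two trainer
# processes, matching trainers=2 in the transpile call above, e.g.
#
#   PSERVERS=127.0.0.1:6174 SERVER_ENDPOINT=127.0.0.1:6174 \
#       TRAINING_ROLE=PSERVER python train.py
#   PSERVERS=127.0.0.1:6174 TRAINING_ROLE=TRAINER python train.py  # run twice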

if __name__ == '__main__':
    main()