commit
b6ca314c44
@ -0,0 +1,190 @@
|
|||||||
|
import sys
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
import paddle.v2 as paddle
|
||||||
|
import paddle.v2.dataset.conll05 as conll05
|
||||||
|
|
||||||
|
|
||||||
|
def db_lstm():
    """Build the deep bidirectional LSTM network for semantic role labeling.

    Eight input features (word, predicate, five context words and a region
    mark) are embedded, mixed into a hidden layer and passed through
    ``depth`` stacked LSTM layers of alternating direction; a CRF layer on
    top yields the training cost and a CRF decoding layer the predicted
    label sequence.

    :return: (crf_cost, crf_dec) layer outputs.
    """
    # Vocabulary sizes from the CoNLL-05 data set.
    word_dict, verb_dict, label_dict = conll05.get_dict()
    word_dict_len = len(word_dict)
    label_dict_len = len(label_dict)
    pred_len = len(verb_dict)

    # Topology hyper-parameters.
    mark_dict_len = 2  # the region mark feature is binary
    word_dim = 32
    mark_dim = 5
    hidden_dim = 512
    depth = 8

    #8 features
    def d_type(size):
        # Every input feature is a sequence of integer ids.
        return paddle.data_type.integer_value_sequence(size)

    word = paddle.layer.data(name='word_data', type=d_type(word_dict_len))
    predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))

    # Context words around the predicate: n2/n1 before, p1/p2 after.
    ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
    ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len))
    ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len))
    ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len))
    ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len))
    mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))

    # Gold label sequence used by the CRF layers.
    target = paddle.layer.data(name='target', type=d_type(label_dict_len))

    default_std = 1 / math.sqrt(hidden_dim) / 3.0

    # 'emb' is overwritten with a pre-trained table in main() and frozen
    # here (initial_std=0, learning_rate=0).
    emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.)
    std_0 = paddle.attr.Param(initial_std=0.)
    std_default = paddle.attr.Param(initial_std=default_std)

    predicate_embedding = paddle.layer.embedding(
        size=word_dim,
        input=predicate,
        param_attr=paddle.attr.Param(
            name='vemb', initial_std=default_std))
    mark_embedding = paddle.layer.embedding(
        size=mark_dim, input=mark, param_attr=std_0)

    # The six word-like inputs share a single embedding table ('emb').
    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
    emb_layers = [
        paddle.layer.embedding(
            size=word_dim, input=x, param_attr=emb_para) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)

    # Mix all eight embeddings into the first hidden layer.
    hidden_0 = paddle.layer.mixed(
        size=hidden_dim,
        bias_attr=std_default,
        input=[
            paddle.layer.full_matrix_projection(
                input=emb, param_attr=std_default) for emb in emb_layers
        ])

    # Hidden-to-hidden projections train with a much smaller learning rate.
    mix_hidden_lr = 1e-3
    lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
    hidden_para_attr = paddle.attr.Param(
        initial_std=default_std, learning_rate=mix_hidden_lr)

    lstm_0 = paddle.layer.lstmemory(
        input=hidden_0,
        act=paddle.activation.Relu(),
        gate_act=paddle.activation.Sigmoid(),
        state_act=paddle.activation.Sigmoid(),
        bias_attr=std_0,
        param_attr=lstm_para_attr)

    #stack L-LSTM and R-LSTM with direct edges
    input_tmp = [hidden_0, lstm_0]

    for i in range(1, depth):
        # Each level mixes the previous hidden layer and LSTM output.
        mix_hidden = paddle.layer.mixed(
            size=hidden_dim,
            bias_attr=std_default,
            input=[
                paddle.layer.full_matrix_projection(
                    input=input_tmp[0], param_attr=hidden_para_attr),
                paddle.layer.full_matrix_projection(
                    input=input_tmp[1], param_attr=lstm_para_attr)
            ])

        # Odd levels run the LSTM in reverse, alternating L- and R-LSTMs.
        lstm = paddle.layer.lstmemory(
            input=mix_hidden,
            act=paddle.activation.Relu(),
            gate_act=paddle.activation.Sigmoid(),
            state_act=paddle.activation.Sigmoid(),
            reverse=((i % 2) == 1),
            bias_attr=std_0,
            param_attr=lstm_para_attr)

        input_tmp = [mix_hidden, lstm]

    # Per-label emission scores fed into the CRF.
    feature_out = paddle.layer.mixed(
        size=label_dict_len,
        bias_attr=std_default,
        input=[
            paddle.layer.full_matrix_projection(
                input=input_tmp[0], param_attr=hidden_para_attr),
            paddle.layer.full_matrix_projection(
                input=input_tmp[1], param_attr=lstm_para_attr)
        ], )

    crf_cost = paddle.layer.crf(size=label_dict_len,
                                input=feature_out,
                                label=target,
                                param_attr=paddle.attr.Param(
                                    name='crfw',
                                    initial_std=default_std,
                                    learning_rate=mix_hidden_lr))

    # Decoding shares the transition weights ('crfw') with the cost layer.
    crf_dec = paddle.layer.crf_decoding(
        name='crf_dec_l',
        size=label_dict_len,
        input=feature_out,
        label=target,
        param_attr=paddle.attr.Param(name='crfw'))

    return crf_cost, crf_dec
|
||||||
|
|
||||||
|
|
||||||
|
def load_parameter(file_name, h, w, header_bytes=16):
    """Load a dense float32 parameter matrix from a binary parameter file.

    :param file_name: path to the binary parameter file.
    :param h: number of rows of the stored matrix.
    :param w: number of columns of the stored matrix.
    :param header_bytes: size of the leading header to skip before the raw
        float32 values (the files read here carry a 16-byte header, which
        stays the default for backward compatibility).
    :return: numpy.ndarray of shape (h, w) and dtype float32.
    """
    with open(file_name, 'rb') as f:
        f.read(header_bytes)  # skip header.
        return np.fromfile(f, dtype=np.float32).reshape(h, w)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Train the db_lstm SRL network with a CRF cost on CoNLL-05 data."""
    paddle.init(use_gpu=False, trainer_count=1)

    # define network topology
    crf_cost, crf_dec = db_lstm()

    # create parameters
    parameters = paddle.parameters.create([crf_cost, crf_dec])

    # create optimizer
    optimizer = paddle.optimizer.Momentum(
        momentum=0,
        learning_rate=2e-2,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        model_average=paddle.optimizer.ModelAverage(
            average_window=0.5, max_average_window=10000), )

    def event_handler(event):
        # Report cost/metrics every 100 batches.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                print "Pass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics)

    trainer = paddle.trainer.SGD(cost=crf_cost,
                                 parameters=parameters,
                                 update_equation=optimizer)
    # Overwrite the frozen 'emb' table with pre-trained embeddings.
    # NOTE(review): shape (44068, 32) is hard-coded — presumably the conll05
    # embedding vocabulary size and word_dim; confirm against the data set.
    parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))

    # NOTE(review): this trains on conll05.test() — verify the test split is
    # intended here rather than a train split.
    trn_reader = paddle.reader.batched(
        paddle.reader.shuffle(
            conll05.test(), buf_size=8192), batch_size=10)

    # Map data-layer names to field positions within each reader sample.
    reader_dict = {
        'word_data': 0,
        'ctx_n2_data': 1,
        'ctx_n1_data': 2,
        'ctx_0_data': 3,
        'ctx_p1_data': 4,
        'ctx_p2_data': 5,
        'verb_data': 6,
        'mark_data': 7,
        'target': 8
    }

    trainer.train(
        reader=trn_reader,
        event_handler=event_handler,
        num_passes=10000,
        reader_dict=reader_dict)
|
||||||
|
|
||||||
|
|
||||||
|
# Run training only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
@ -0,0 +1,247 @@
|
|||||||
|
import sys
|
||||||
|
from os.path import join as join_path
|
||||||
|
import paddle.trainer_config_helpers.attrs as attrs
|
||||||
|
from paddle.trainer_config_helpers.poolings import MaxPooling
|
||||||
|
import paddle.v2.layer as layer
|
||||||
|
import paddle.v2.activation as activation
|
||||||
|
import paddle.v2.data_type as data_type
|
||||||
|
import paddle.v2.dataset.imdb as imdb
|
||||||
|
import paddle.v2 as paddle
|
||||||
|
|
||||||
|
|
||||||
|
def sequence_conv_pool(input,
                       input_size,
                       context_len,
                       hidden_size,
                       name=None,
                       context_start=None,
                       pool_type=None,
                       context_proj_layer_name=None,
                       context_proj_param_attr=False,
                       fc_layer_name=None,
                       fc_param_attr=None,
                       fc_bias_attr=None,
                       fc_act=None,
                       pool_bias_attr=None,
                       fc_attr=None,
                       context_attr=None,
                       pool_attr=None):
    """
    Text convolution pooling layers helper.

    Text input => Context Projection => FC Layer => Pooling => Output.

    :param name: name of output layer(pooling layer name)
    :type name: basestring
    :param input: name of input layer
    :type input: LayerOutput
    :param input_size: dimension of the input layer.
    :type input_size: int
    :param context_len: context projection length. See
                        context_projection's document.
    :type context_len: int
    :param hidden_size: FC Layer size.
    :type hidden_size: int
    :param context_start: context projection length. See
                          context_projection's context_start.
    :type context_start: int or None
    :param pool_type: pooling layer type. See pooling_layer's document.
    :type pool_type: BasePoolingType.
    :param context_proj_layer_name: context projection layer name.
                                    None if user don't care.
    :type context_proj_layer_name: basestring
    :param context_proj_param_attr: context projection parameter attribute.
                                    None if user don't care.
    :type context_proj_param_attr: ParameterAttribute or None.
    :param fc_layer_name: fc layer name. None if user don't care.
    :type fc_layer_name: basestring
    :param fc_param_attr: fc layer parameter attribute. None if user don't care.
    :type fc_param_attr: ParameterAttribute or None
    :param fc_bias_attr: fc bias parameter attribute. False if no bias,
                         None if user don't care.
    :type fc_bias_attr: ParameterAttribute or None
    :param fc_act: fc layer activation type. None means tanh
    :type fc_act: BaseActivation
    :param pool_bias_attr: pooling layer bias attr. None if don't care.
                           False if no bias.
    :type pool_bias_attr: ParameterAttribute or None.
    :param fc_attr: fc layer extra attribute.
    :type fc_attr: ExtraLayerAttribute
    :param context_attr: context projection layer extra attribute.
    :type context_attr: ExtraLayerAttribute
    :param pool_attr: pooling layer extra attribute.
    :type pool_attr: ExtraLayerAttribute
    :return: output layer name.
    :rtype: LayerOutput
    """
    # Set Default Value to param
    context_proj_layer_name = "%s_conv_proj" % name \
        if context_proj_layer_name is None else context_proj_layer_name

    # Linear context projection widens each step to input_size * context_len.
    with layer.mixed(
            name=context_proj_layer_name,
            size=input_size * context_len,
            act=activation.Linear(),
            layer_attr=context_attr) as m:
        m += layer.context_projection(
            input=input,
            context_len=context_len,
            context_start=context_start,
            padding_attr=context_proj_param_attr)

    fc_layer_name = "%s_conv_fc" % name \
        if fc_layer_name is None else fc_layer_name
    # The FC over the projected context acts as the convolution filter bank.
    fl = layer.fc(name=fc_layer_name,
                  input=m,
                  size=hidden_size,
                  act=fc_act,
                  layer_attr=fc_attr,
                  param_attr=fc_param_attr,
                  bias_attr=fc_bias_attr)

    # Pool over the sequence dimension to a fixed-size output.
    return layer.pooling(
        name=name,
        input=fl,
        pooling_type=pool_type,
        bias_attr=pool_bias_attr,
        layer_attr=pool_attr)
|
||||||
|
|
||||||
|
|
||||||
|
def convolution_net(input_dim,
                    class_dim=2,
                    emb_dim=128,
                    hid_dim=128,
                    is_predict=False):
    """Sequence CNN for sentiment classification.

    Word ids are embedded, fed through two conv-pool columns with context
    windows of 3 and 4, and classified with a softmax output layer.

    input_dim: word dictionary dimension.
    class_dim: number of categories.
    emb_dim: dimension of word embedding.
    hid_dim: dimension of each conv-pool hidden layer.
    is_predict: is predicting or not.
                NOTE(review): currently unused in this function.
    """
    data = layer.data("word", data_type.integer_value_sequence(input_dim))
    emb = layer.embedding(input=data, size=emb_dim)
    conv_3 = sequence_conv_pool(
        input=emb, input_size=emb_dim, context_len=3, hidden_size=hid_dim)
    conv_4 = sequence_conv_pool(
        input=emb, input_size=emb_dim, context_len=4, hidden_size=hid_dim)
    output = layer.fc(input=[conv_3, conv_4],
                      size=class_dim,
                      act=activation.Softmax())
    # BUG FIX: the label cardinality must follow class_dim rather than a
    # hard-coded 2, so non-binary classification works (default unchanged).
    lbl = layer.data("label", data_type.integer_value(class_dim))
    cost = layer.classification_cost(input=output, label=lbl)
    return cost
|
||||||
|
|
||||||
|
|
||||||
|
def stacked_lstm_net(input_dim,
                     class_dim=2,
                     emb_dim=128,
                     hid_dim=512,
                     stacked_num=3,
                     is_predict=False):
    """
    A Wrapper for sentiment classification task.
    This network uses bi-directional recurrent network,
    consisting three LSTM layers. This configure is referred to
    the paper as following url, but use fewer layers.
        http://www.aclweb.org/anthology/P15-1109

    input_dim: here is word dictionary dimension.
    class_dim: number of categories.
    emb_dim: dimension of word embedding.
    hid_dim: dimension of hidden layer.
    stacked_num: number of stacked lstm-hidden layer.
    is_predict: is predicting or not.
                Some layers is not needed in network when predicting.
                NOTE(review): currently unused in this function.
    """
    # An odd depth keeps the forward/backward direction alternation balanced.
    assert stacked_num % 2 == 1

    layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5)
    fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3)
    lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.)
    para_attr = [fc_para_attr, lstm_para_attr]
    bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.)
    relu = activation.Relu()
    linear = activation.Linear()

    data = layer.data("word", data_type.integer_value_sequence(input_dim))
    emb = layer.embedding(input=data, size=emb_dim)

    fc1 = layer.fc(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
    lstm1 = layer.lstmemory(
        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)

    # Stack (fc, lstm) pairs; even levels run the LSTM in reverse.
    inputs = [fc1, lstm1]
    for i in range(2, stacked_num + 1):
        fc = layer.fc(input=inputs,
                      size=hid_dim,
                      act=linear,
                      param_attr=para_attr,
                      bias_attr=bias_attr)
        lstm = layer.lstmemory(
            input=fc,
            reverse=(i % 2) == 0,
            act=relu,
            bias_attr=bias_attr,
            layer_attr=layer_attr)
        inputs = [fc, lstm]

    # Max-pool both top outputs over time before the classifier.
    fc_last = layer.pooling(input=inputs[0], pooling_type=MaxPooling())
    lstm_last = layer.pooling(input=inputs[1], pooling_type=MaxPooling())
    output = layer.fc(input=[fc_last, lstm_last],
                      size=class_dim,
                      act=activation.Softmax(),
                      bias_attr=bias_attr,
                      param_attr=para_attr)

    # BUG FIX: the label cardinality must follow class_dim rather than a
    # hard-coded 2, matching the size of the output layer (default unchanged).
    lbl = layer.data("label", data_type.integer_value(class_dim))
    cost = layer.classification_cost(input=output, label=lbl)
    return cost
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: train an IMDB sentiment classifier end to end.
if __name__ == '__main__':
    # init
    paddle.init(use_gpu=True, trainer_count=4)

    # network config
    print 'load dictionary...'
    word_dict = imdb.word_dict()
    dict_dim = len(word_dict)
    class_dim = 2

    # Please choose the way to build the network
    # by uncommenting the corresponding line.
    cost = convolution_net(dict_dim, class_dim=class_dim)
    # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)

    # create parameters
    parameters = paddle.parameters.create(cost)

    # create optimizer
    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=2e-3,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))

    # End batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            # Full report every 100 batches; otherwise a progress dot.
            if event.batch_id % 100 == 0:
                print "\nPass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics)
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            # Evaluate on the IMDB test set after every pass.
            result = trainer.test(
                reader=paddle.reader.batched(
                    lambda: imdb.test(word_dict), batch_size=128),
                reader_dict={'word': 0,
                             'label': 1})
            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)

    # create trainer
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=adam_optimizer)

    trainer.train(
        reader=paddle.reader.batched(
            paddle.reader.shuffle(
                lambda: imdb.train(word_dict), buf_size=1000),
            batch_size=100),
        event_handler=event_handler,
        reader_dict={'word': 0,
                     'label': 1},
        num_passes=10)
|
@ -1,4 +0,0 @@
|
|||||||
#!/bin/bash
# Install build prerequisites on macOS via Homebrew.
brew update
# Add the homebrew/science tap (historically hosted scientific formulae).
brew tap homebrew/science
brew install openblas swig md5sha1sum
|
|
Loading…
Reference in new issue