Merge pull request #407 from zhangjcqq/develop

update srl demo
avx_docs
qingqing01 8 years ago committed by GitHub
commit 5d511a16b8

@ -17,24 +17,15 @@ import os
from optparse import OptionParser
def extract_dict_features(pair_file, feature_file, src_dict_file,
tgt_dict_file):
src_dict = set()
tgt_dict = set()
with open(pair_file) as fin, open(feature_file, 'w') as feature_out, open(
src_dict_file, 'w') as src_dict_out, open(tgt_dict_file,
'w') as tgt_dict_out:
def extract_dict_features(pair_file, feature_file):
with open(pair_file) as fin, open(feature_file, 'w') as feature_out:
for line in fin:
sentence, labels = line.strip().split('\t')
sentence, predicate, labels = line.strip().split('\t')
sentence_list = sentence.split()
labels_list = labels.split()
src_dict.update(sentence_list)
tgt_dict.update(labels_list)
verb_index = labels_list.index('B-V')
verb_feature = sentence_list[verb_index]
mark = [0] * len(labels_list)
if verb_index > 0:
@ -42,47 +33,50 @@ def extract_dict_features(pair_file, feature_file, src_dict_file,
ctx_n1 = sentence_list[verb_index - 1]
else:
ctx_n1 = 'bos'
ctx_n1_feature = ctx_n1
if verb_index > 1:
mark[verb_index - 2] = 1
ctx_n2 = sentence_list[verb_index - 2]
else:
ctx_n2 = 'bos'
mark[verb_index] = 1
ctx_0_feature = sentence_list[verb_index]
ctx_0 = sentence_list[verb_index]
if verb_index < len(labels_list) - 2:
mark[verb_index + 1] = 1
ctx_p1 = sentence_list[verb_index + 1]
else:
ctx_p1 = 'eos'
ctx_p1_feature = ctx_p1
if verb_index < len(labels_list) - 3:
mark[verb_index + 2] = 1
ctx_p2 = sentence_list[verb_index + 2]
else:
ctx_p2 = 'eos'
feature_str = sentence + '\t' \
+ verb_feature + '\t' \
+ ctx_n1_feature + '\t' \
+ ctx_0_feature + '\t' \
+ ctx_p1_feature + '\t' \
+ predicate + '\t' \
+ ctx_n2 + '\t' \
+ ctx_n1 + '\t' \
+ ctx_0 + '\t' \
+ ctx_p1 + '\t' \
+ ctx_p2 + '\t' \
+ ' '.join([str(i) for i in mark]) + '\t' \
+ labels
feature_out.write(feature_str + '\n')
src_dict_out.write('<unk>\n')
src_dict_out.write('\n'.join(list(src_dict)))
tgt_dict_out.write('\n'.join(list(tgt_dict)))
if __name__ == '__main__':
usage = '-p pair_file -f feature_file -s source dictionary -t target dictionary '
usage = '-p pair_file -f feature_file'
parser = OptionParser(usage)
parser.add_option('-p', dest='pair_file', help='the pair file')
parser.add_option(
'-f', dest='feature_file', help='the file to store feature')
parser.add_option(
'-s', dest='src_dict', help='the file to store source dictionary')
parser.add_option(
'-t', dest='tgt_dict', help='the file to store target dictionary')
parser.add_option('-f', dest='feature_file', help='the feature file')
(options, args) = parser.parse_args()
extract_dict_features(options.pair_file, options.feature_file,
options.src_dict, options.tgt_dict)
extract_dict_features(options.pair_file, options.feature_file)

@ -51,7 +51,7 @@ def read_sentences(words_file):
for line in fin:
line = line.strip()
if line == '':
sentences.append(s.lower())
sentences.append(s)
s = ''
else:
s += line + ' '
@ -64,6 +64,11 @@ def transform_labels(sentences, labels):
if len(labels[i]) == 1:
continue
else:
verb_list = []
for x in labels[i][0]:
if x !='-':
verb_list.append(x)
for j in xrange(1, len(labels[i])):
label_list = labels[i][j]
current_tag = 'O'
@ -88,8 +93,7 @@ def transform_labels(sentences, labels):
is_in_bracket = True
else:
print 'error:', ll
sen_lab_pair.append((sentences[i], label_seq))
sen_lab_pair.append((sentences[i], verb_list[j-1], label_seq))
return sen_lab_pair
@ -97,9 +101,9 @@ def write_file(sen_lab_pair, output_file):
with open(output_file, 'w') as fout:
for x in sen_lab_pair:
sentence = x[0]
label_seq = ' '.join(x[1])
assert len(sentence.split()) == len(x[1])
fout.write(sentence + '\t' + label_seq + '\n')
label_seq = ' '.join(x[2])
assert len(sentence.split()) == len(x[2])
fout.write(sentence + '\t' + x[1]+'\t' +label_seq + '\n')
if __name__ == '__main__':

@ -14,6 +14,10 @@
# limitations under the License.
set -e
wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate
tar -xzvf conll05st-tests.tar.gz
rm conll05st-tests.tar.gz
cp ./conll05st-release/test.wsj/words/test.wsj.words.gz .
@ -22,4 +26,4 @@ gunzip test.wsj.words.gz
gunzip test.wsj.props.gz
python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair
python extract_dict_feature.py -p test.wsj.seq_pair -f feature -s src.dict -t tgt.dict
python extract_dict_feature.py -p test.wsj.seq_pair -f feature

@ -17,11 +17,15 @@ from paddle.trainer.PyDataProvider2 import *
UNK_IDX = 0
def hook(settings, word_dict, label_dict, **kwargs):
def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
settings.word_dict = word_dict
settings.label_dict = label_dict
settings.predicate_dict = predicate_dict
#all inputs are integral and sequential type
settings.slots = [
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(predicate_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
@ -31,27 +35,33 @@ def hook(settings, word_dict, label_dict, **kwargs):
]
@provider(init_hook=hook)
def process(obj, file_name):
def get_batch_size(yeild_data):
return len(yeild_data[0])
@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_name):
with open(file_name, 'r') as fdata:
for line in fdata:
sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = \
sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \
line.strip().split('\t')
words = sentence.split()
sen_len = len(words)
word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words]
word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len
ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX)] * sen_len
ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
marks = mark.split()
mark_slot = [int(w) for w in marks]
label_list = label.split()
label_slot = [obj.label_dict.get(w) for w in label_list]
yield word_slot, predicate_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, mark_slot, label_slot
label_slot = [settings.label_dict.get(w) for w in label_list]
yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot

@ -18,8 +18,9 @@ import sys
from paddle.trainer_config_helpers import *
#file paths
word_dict_file = './data/src.dict'
label_dict_file = './data/tgt.dict'
word_dict_file = './data/wordDict.txt'
label_dict_file = './data/targetDict.txt'
predicate_file= './data/verbDict.txt'
train_list_file = './data/train.list'
test_list_file = './data/test.list'
@ -30,8 +31,10 @@ if not is_predict:
#load dictionaries
word_dict = dict()
label_dict = dict()
predicate_dict = dict()
with open(word_dict_file, 'r') as f_word, \
open(label_dict_file, 'r') as f_label:
open(label_dict_file, 'r') as f_label, \
open(predicate_file, 'r') as f_pre:
for i, line in enumerate(f_word):
w = line.strip()
word_dict[w] = i
@ -40,6 +43,11 @@ if not is_predict:
w = line.strip()
label_dict[w] = i
for i, line in enumerate(f_pre):
w = line.strip()
predicate_dict[w] = i
if is_test:
train_list_file = None
@ -50,91 +58,157 @@ if not is_predict:
module='dataprovider',
obj='process',
args={'word_dict': word_dict,
'label_dict': label_dict})
'label_dict': label_dict,
'predicate_dict': predicate_dict })
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_len = len(predicate_dict)
else:
word_dict_len = get_config_arg('dict_len', int)
label_dict_len = get_config_arg('label_len', int)
pred_len = get_config_arg('pred_len', int)
############################## Hyper-parameters ##################################
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 128
hidden_dim = 512
depth = 8
emb_lr = 1e-2
fc_lr = 1e-2
lstm_lr = 2e-2
########################### Optimizer #######################################
settings(
batch_size=150,
learning_method=AdamOptimizer(),
learning_rate=1e-3,
learning_method=MomentumOptimizer(momentum=0),
learning_rate=2e-2,
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
is_async=False,
model_average=ModelAverage(average_window=0.5,
max_average_window=10000),
)
#6 features
####################################### network ##############################
#8 features and 1 target
word = data_layer(name='word_data', size=word_dict_len)
predicate = data_layer(name='verb_data', size=word_dict_len)
predicate = data_layer(name='verb_data', size=pred_len)
ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len)
ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len)
ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len)
ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
mark = data_layer(name='mark_data', size=mark_dict_len)
if not is_predict:
target = data_layer(name='target', size=label_dict_len)
ptt = ParameterAttribute(name='src_emb', learning_rate=emb_lr)
layer_attr = ExtraLayerAttribute(drop_rate=0.5)
fc_para_attr = ParameterAttribute(learning_rate=fc_lr)
lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=lstm_lr)
para_attr = [fc_para_attr, lstm_para_attr]
word_embedding = embedding_layer(size=word_dim, input=word, param_attr=ptt)
predicate_embedding = embedding_layer(
size=word_dim, input=predicate, param_attr=ptt)
ctx_n1_embedding = embedding_layer(size=word_dim, input=ctx_n1, param_attr=ptt)
ctx_0_embedding = embedding_layer(size=word_dim, input=ctx_0, param_attr=ptt)
ctx_p1_embedding = embedding_layer(size=word_dim, input=ctx_p1, param_attr=ptt)
mark_embedding = embedding_layer(size=mark_dim, input=mark)
default_std=1/math.sqrt(hidden_dim)/3.0
emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
std_0 = ParameterAttribute(initial_std=0.)
std_default = ParameterAttribute(initial_std=default_std)
predicate_embedding = embedding_layer(size=word_dim, input=predicate, param_attr=ParameterAttribute(name='vemb',initial_std=default_std))
mark_embedding = embedding_layer(name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
word_input=[word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [embedding_layer(size=word_dim, input=x, param_attr=emb_para) for x in word_input]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
hidden_0 = mixed_layer(
name='hidden0',
size=hidden_dim,
input=[
full_matrix_projection(input=word_embedding),
full_matrix_projection(input=predicate_embedding),
full_matrix_projection(input=ctx_n1_embedding),
full_matrix_projection(input=ctx_0_embedding),
full_matrix_projection(input=ctx_p1_embedding),
full_matrix_projection(input=mark_embedding),
])
bias_attr=std_default,
input=[ full_matrix_projection(input=emb, param_attr=std_default ) for emb in emb_layers ])
lstm_0 = lstmemory(input=hidden_0, layer_attr=layer_attr)
mix_hidden_lr = 1e-3
lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
hidden_para_attr = ParameterAttribute(initial_std=default_std, learning_rate=mix_hidden_lr)
lstm_0 = lstmemory(name='lstm0',
input=hidden_0,
act=ReluActivation(),
gate_act=SigmoidActivation(),
state_act=SigmoidActivation(),
bias_attr=std_0,
param_attr=lstm_para_attr)
#stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
for i in range(1, depth):
fc = fc_layer(input=input_tmp, size=hidden_dim, param_attr=para_attr)
mix_hidden = mixed_layer(name='hidden'+str(i),
size=hidden_dim,
bias_attr=std_default,
input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
]
)
lstm = lstmemory(name='lstm'+str(i),
input=mix_hidden,
act=ReluActivation(),
gate_act=SigmoidActivation(),
state_act=SigmoidActivation(),
reverse=((i % 2)==1),
bias_attr=std_0,
param_attr=lstm_para_attr)
input_tmp = [mix_hidden, lstm]
feature_out = mixed_layer(name='output',
size=label_dict_len,
bias_attr=std_default,
input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
],
)
lstm = lstmemory(
input=fc,
act=ReluActivation(),
reverse=(i % 2) == 1,
layer_attr=layer_attr)
input_tmp = [fc, lstm]
prob = fc_layer(
input=input_tmp,
size=label_dict_len,
act=SoftmaxActivation(),
param_attr=para_attr)
if not is_predict:
cls = classification_cost(input=prob, label=target)
outputs(cls)
crf_l = crf_layer( name = 'crf',
size = label_dict_len,
input = feature_out,
label = target,
param_attr=ParameterAttribute(name='crfw',initial_std=default_std, learning_rate=mix_hidden_lr)
)
crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
size = label_dict_len,
input = feature_out,
label = target,
param_attr=ParameterAttribute(name='crfw')
)
eval = sum_evaluator(input=crf_dec_l)
outputs(crf_l)
else:
outputs(prob)
crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
size = label_dict_len,
input = feature_out,
param_attr=ParameterAttribute(name='crfw')
)
outputs(crf_dec_l)

@ -26,7 +26,7 @@ UNK_IDX = 0
class Prediction():
def __init__(self, train_conf, dict_file, model_dir, label_file):
def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_file):
"""
train_conf: trainer configure.
dict_file: word dictionary file name.
@ -35,26 +35,41 @@ class Prediction():
self.dict = {}
self.labels = {}
self.predicate_dict={}
self.labels_reverse = {}
self.load_dict_label(dict_file, label_file)
self.load_dict_label(dict_file, label_file, predicate_dict_file)
len_dict = len(self.dict)
len_label = len(self.labels)
conf = parse_config(train_conf, 'dict_len=' + str(len_dict) +
',label_len=' + str(len_label) + ',is_predict=True')
len_pred = len(self.predicate_dict)
conf = parse_config(
train_conf,
'dict_len=' + str(len_dict) +
',label_len=' + str(len_label) +
',pred_len=' + str(len_pred) +
',is_predict=True')
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(model_dir)
slots = [
integer_value_sequence(len_dict),
integer_value_sequence(len_pred),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(2)
]
integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(2)
]
self.converter = DataProviderConverter(slots)
def load_dict_label(self, dict_file, label_file):
def load_dict_label(self, dict_file, label_file, predicate_dict_file):
"""
Load dictionary from self.dict_file.
"""
@ -65,39 +80,42 @@ class Prediction():
self.labels[line.strip()] = line_count
self.labels_reverse[line_count] = line.strip()
for line_count, line in enumerate(open(predicate_dict_file, 'r')):
self.predicate_dict[line.strip()] = line_count
def get_data(self, data_file):
"""
Get input data of paddle format.
"""
with open(data_file, 'r') as fdata:
for line in fdata:
sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip(
sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip(
).split('\t')
words = sentence.split()
sen_len = len(words)
word_slot = [self.dict.get(w, UNK_IDX) for w in words]
predicate_slot = [self.dict.get(predicate, UNK_IDX)] * sen_len
predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)] * sen_len
ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len
ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len
marks = mark.split()
mark_slot = [int(w) for w in marks]
yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot
yield word_slot, predicate_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, mark_slot
def predict(self, data_file):
def predict(self, data_file, output_file):
"""
data_file: file name of input data.
"""
input = self.converter(self.get_data(data_file))
output = self.network.forwardTest(input)
prob = output[0]["value"]
lab = list(np.argsort(-prob)[:, 0])
lab = output[0]["id"].tolist()
with open(data_file, 'r') as fin, open('predict.res', 'w') as fout:
with open(data_file, 'r') as fin, open(output_file, 'w') as fout:
index = 0
for line in fin:
sen = line.split('\t')[0]
@ -109,8 +127,8 @@ class Prediction():
def option_parser():
usage = ("python predict.py -c config -w model_dir "
"-d word dictionary -l label_file -i input_file")
usage = ("python predict.py -c config -w model_dir "
"-d word dictionary -l label_file -i input_file -p pred_dict_file")
parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option(
"-c",
@ -131,6 +149,13 @@ def option_parser():
dest="label_file",
default=None,
help="label file")
parser.add_option(
"-p",
"--predict_dict_file",
action="store",
dest="predict_dict_file",
default=None,
help="predict_dict_file")
parser.add_option(
"-i",
"--data",
@ -144,6 +169,14 @@ def option_parser():
dest="model_path",
default=None,
help="model path")
parser.add_option(
"-o",
"--output_file",
action="store",
dest="output_file",
default=None,
help="output file")
return parser.parse_args()
@ -154,10 +187,12 @@ def main():
dict_file = options.dict_file
model_path = options.model_path
label_file = options.label_file
predict_dict_file = options.predict_dict_file
output_file = options.output_file
swig_paddle.initPaddle("--use_gpu=0")
predict = Prediction(train_conf, dict_file, model_path, label_file)
predict.predict(data_file)
predict = Prediction(train_conf, dict_file, model_path, label_file, predict_dict_file)
predict.predict(data_file,output_file)
if __name__ == '__main__':

@ -26,15 +26,18 @@ LOG=`get_best_pass $log`
LOG=(${LOG})
best_model_path="output/pass-${LOG[1]}"
config_file=db_lstm.py
dict_file=./data/src.dict
label_file=./data/tgt.dict
dict_file=./data/wordDict.txt
label_file=./data/targetDict.txt
predicate_dict_file=./data/verbDict.txt
input_file=./data/feature
output_file=predict.res
python predict.py \
-c $config_file \
-w $best_model_path \
-l $label_file \
-p $predicate_dict_file \
-d $dict_file \
-i $input_file
-i $input_file \
-o $output_file

@ -36,4 +36,5 @@ paddle train \
--job=test \
--use_gpu=false \
--config_args=is_test=1 \
--test_all_data_in_one_period=1 \
2>&1 | tee 'test.log'

@ -16,11 +16,14 @@
set -e
paddle train \
--config=./db_lstm.py \
--use_gpu=0 \
--log_period=5000 \
--trainer_count=1 \
--show_parameter_stats_period=5000 \
--save_dir=./output \
--trainer_count=4 \
--log_period=10 \
--num_passes=500 \
--use_gpu=false \
--show_parameter_stats_period=10 \
--num_passes=10000 \
--average_test_period=10000000 \
--init_model_path=./data \
--load_missing_parameter_strategy=rand \
--test_all_data_in_one_period=1 \
2>&1 | tee 'train.log'
2>&1 | tee 'train.log'

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save