You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
116 lines
3.5 KiB
116 lines
3.5 KiB
import numpy as np
|
|
import paddle.v2 as paddle
|
|
from model_v2 import db_lstm
|
|
|
|
# Fallback index used when a word is not found in word_dict.
UNK_IDX = 0

# Dictionary files, one entry per line; an entry's line number becomes its index.
word_dict_file = './data/wordDict.txt'
label_dict_file = './data/targetDict.txt'
predicate_file = './data/verbDict.txt'

# Lookup tables (entry -> index); they start empty and are filled at import time.
word_dict = {}
label_dict = {}
predicate_dict = {}
|
|
|
|
# Populate the three lookup tables. All three files are opened up front; each
# stripped line is mapped to its zero-based line number, so a repeated entry
# keeps the index of its last occurrence.
with open(word_dict_file, 'r') as word_src, \
        open(label_dict_file, 'r') as label_src, \
        open(predicate_file, 'r') as pred_src:
    for source, table in ((word_src, word_dict),
                          (label_src, label_dict),
                          (pred_src, predicate_dict)):
        for index, raw in enumerate(source):
            table[raw.strip()] = index

# Number of distinct entries in each table.
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_len = len(predicate_dict)
|
|
|
|
|
|
def train_reader(file_name="data/feature"):
    """Create a sample generator over a tab-separated feature file.

    Every line of ``file_name`` must contain exactly nine tab-separated
    fields: sentence, predicate, five context words (ctx_n2, ctx_n1, ctx_0,
    ctx_p1, ctx_p2), mark and label. A line with a different field count
    raises ``ValueError`` when it is unpacked.

    Returns:
        A zero-argument callable. Each call opens the file and yields, per
        line, a 9-tuple of lists:
        (word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, label).
    """

    def reader():
        with open(file_name, 'r') as feature_file:
            for raw_line in feature_file:
                fields = raw_line.strip().split('\t')
                (sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,
                 mark, label) = fields

                tokens = sentence.split()
                length = len(tokens)

                # Words missing from word_dict fall back to UNK_IDX.
                word_slot = [word_dict.get(tok, UNK_IDX) for tok in tokens]

                # Each context word is a single id repeated once per token.
                (ctx_n2_slot, ctx_n1_slot, ctx_0_slot,
                 ctx_p1_slot, ctx_p2_slot) = [
                    [word_dict.get(ctx, UNK_IDX)] * length
                    for ctx in (ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2)
                ]

                # No fallback here: an unknown predicate or label yields None.
                predicate_slot = [predicate_dict.get(predicate)] * length
                mark_slot = [int(flag) for flag in mark.split()]
                label_slot = [label_dict.get(tag) for tag in label.split()]

                yield (word_slot, ctx_n2_slot, ctx_n1_slot, ctx_0_slot,
                       ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot,
                       label_slot)

    return reader
|
|
|
|
|
|
def load_parameter(file_name, h, w):
    """Read a float32 parameter file into an ``h`` x ``w`` array.

    The first 16 bytes of the file are treated as a header and discarded;
    everything after them is interpreted as float32 values.

    Args:
        file_name: Path of the binary parameter file.
        h: Number of rows of the returned array.
        w: Number of columns of the returned array.

    Returns:
        A ``numpy`` float32 array of shape ``(h, w)``.

    Raises:
        ValueError: If the number of values read cannot be reshaped to
            ``(h, w)``.
    """
    header_size = 16
    with open(file_name, 'rb') as param_file:
        param_file.read(header_size)  # skip header for float type.
        values = np.fromfile(param_file, dtype=np.float32)
    return values.reshape(h, w)
|
|
|
|
|
|
def main():
    """Build the db_lstm network and train it with paddle's v2 SGD trainer.

    Initialises paddle on CPU with a single trainer, creates the parameters
    for the cost and decoding layers returned by ``db_lstm``, loads the 'emb'
    parameter from ``data/emb`` and trains on shuffled batches of 10 samples
    produced by ``train_reader()``.
    """
    paddle.init(use_gpu=False, trainer_count=1)

    # define network topology
    crf_cost, crf_dec = db_lstm(word_dict_len, label_dict_len, pred_len)

    parameters = paddle.parameters.create([crf_cost, crf_dec])
    optimizer = paddle.optimizer.Momentum(momentum=0.01, learning_rate=2e-2)

    def event_handler(event):
        """Report pass, batch, cost and metrics on every 100th finished batch."""
        if not isinstance(event, paddle.event.EndIteration):
            return
        if event.batch_id % 100 == 0:
            # Single-argument print(...) emits the same text under Python 2
            # and Python 3; the bare print statement only parsed on Python 2.
            print("Pass %d, Batch %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, event.cost, event.metrics))

    trainer = paddle.trainer.SGD(cost=crf_cost,
                                 parameters=parameters,
                                 update_equation=optimizer)

    # NOTE(review): the 44068 x 32 shape is hard-coded; presumably 44068 must
    # equal word_dict_len -- confirm against the dictionary file.
    parameters.set('emb', load_parameter("data/emb", 44068, 32))

    # Maps each input name to its position in the tuples yielded by
    # train_reader().
    reader_dict = {
        'word_data': 0,
        'ctx_n2_data': 1,
        'ctx_n1_data': 2,
        'ctx_0_data': 3,
        'ctx_p1_data': 4,
        'ctx_p2_data': 5,
        'verb_data': 6,
        'mark_data': 7,
        'target': 8,
    }
    trn_reader = paddle.reader.batched(
        paddle.reader.shuffle(
            train_reader(), buf_size=8192), batch_size=10)
    trainer.train(
        reader=trn_reader,
        event_handler=event_handler,
        num_passes=10000,
        reader_dict=reader_dict)
|
|
|
|
|
|
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|