import paddle.v2.dataset.common as common
import tarfile
import gzip
import itertools

__all__ = ['test', 'get_dict', 'get_embedding']
|  | """ | ||||||
|  | Conll 2005 dataset.  Paddle semantic role labeling Book and demo use this | ||||||
|  | dataset as an example. Because Conll 2005 is not free in public, the default | ||||||
|  | downloaded URL is test set of Conll 2005 (which is public). Users can change | ||||||
|  | URL and MD5 to their Conll dataset. | ||||||
|  | """ | ||||||

DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'

UNK_IDX = 0


def load_dict(filename):
    """
    Load a dictionary file: one token per line, with the 0-based line number
    used as the token's ID.
    """
    d = dict()
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            d[line.strip()] = i
    return d


def corpus_reader(data_path, words_name, props_name):
    """
    Read one corpus from a conll05st tarball. It returns a reader function
    whose iterator yields, for each predicate of each sentence, a tuple of
    the sentence (a list of words), the predicate word, and the label
    sequence (a list of IOB tags, one per word).

    :param data_path: path of the corpus tarball.
    :type data_path: basestring
    :param words_name: member name of the gzipped words file in the tarball.
    :type words_name: basestring
    :param props_name: member name of the gzipped props file in the tarball.
    :type props_name: basestring
    :return: a reader function.
    :rtype: callable
    """

    def reader():
        tf = tarfile.open(data_path)
        wf = tf.extractfile(words_name)
        pf = tf.extractfile(props_name)
        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
                fileobj=pf) as props_file:
            sentences = []
            labels = []
            one_seg = []
            for word, label in itertools.izip(words_file, props_file):
                word = word.strip()
                label = label.strip().split()

                if len(label) == 0:  # end of sentence
                    # transpose the per-word prop fields so that each column
                    # becomes one label sequence
                    for i in xrange(len(one_seg[0])):
                        a_kind_label = [x[i] for x in one_seg]
                        labels.append(a_kind_label)

                    if len(labels) >= 1:
                        # the first column lists the predicate word of each
                        # frame; the remaining columns carry its arguments
                        verb_list = []
                        for x in labels[0]:
                            if x != '-':
                                verb_list.append(x)

                        # convert each bracketed column into an IOB tag
                        # sequence and emit one sample per predicate
                        for i, lbl in enumerate(labels[1:]):
                            cur_tag = 'O'
                            is_in_bracket = False
                            lbl_seq = []
                            for l in lbl:
                                if l == '*' and not is_in_bracket:
                                    lbl_seq.append('O')
                                elif l == '*' and is_in_bracket:
                                    lbl_seq.append('I-' + cur_tag)
                                elif l == '*)':
                                    lbl_seq.append('I-' + cur_tag)
                                    is_in_bracket = False
                                elif l.find('(') != -1 and l.find(')') != -1:
                                    cur_tag = l[1:l.find('*')]
                                    lbl_seq.append('B-' + cur_tag)
                                    is_in_bracket = False
                                elif l.find('(') != -1 and l.find(')') == -1:
                                    cur_tag = l[1:l.find('*')]
                                    lbl_seq.append('B-' + cur_tag)
                                    is_in_bracket = True
                                else:
                                    print 'error:', l

                            yield sentences, verb_list[i], lbl_seq

                    sentences = []
                    labels = []
                    one_seg = []
                else:
                    sentences.append(word)
                    one_seg.append(label)

    return reader
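
# An illustrative example (not in the original file) of the conversion done
# by corpus_reader above: a per-predicate props column whose fields (one per
# word, read top to bottom) are
#
#   (A0*   *)   (V*)   (A1*   *   *)
#
# is turned into the IOB tag sequence
#
#   ['B-A0', 'I-A0', 'B-V', 'B-A1', 'I-A1', 'I-A1']
#
# and yielded together with the sentence's word list and the predicate word
# taken from the first props column.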


def reader_creator(corpus_reader,
                   word_dict=None,
                   predicate_dict=None,
                   label_dict=None):
    def reader():
        for sentence, predicate, labels in corpus_reader():

            sen_len = len(sentence)

            # build a five-word context window centered on the predicate;
            # mark flags the positions inside the window, and 'bos'/'eos'
            # pad the context words at sentence boundaries
            verb_index = labels.index('B-V')
            mark = [0] * len(labels)
            if verb_index > 0:
                mark[verb_index - 1] = 1
                ctx_n1 = sentence[verb_index - 1]
            else:
                ctx_n1 = 'bos'

            if verb_index > 1:
                mark[verb_index - 2] = 1
                ctx_n2 = sentence[verb_index - 2]
            else:
                ctx_n2 = 'bos'

            mark[verb_index] = 1
            ctx_0 = sentence[verb_index]

            if verb_index < len(labels) - 1:
                mark[verb_index + 1] = 1
                ctx_p1 = sentence[verb_index + 1]
            else:
                ctx_p1 = 'eos'

            if verb_index < len(labels) - 2:
                mark[verb_index + 2] = 1
                ctx_p2 = sentence[verb_index + 2]
            else:
                ctx_p2 = 'eos'

            # map words, predicate, context words and labels to dictionary
            # IDs; unknown words fall back to UNK_IDX
            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
            pred_idx = [predicate_dict.get(predicate)] * sen_len

            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len

            label_idx = [label_dict.get(w) for w in labels]

            yield word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, \
                ctx_0_idx, ctx_p1_idx, ctx_p2_idx, mark, label_idx

    return reader()
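
# A hedged usage sketch (not part of the original file): each sample produced
# by the reader that test() builds below is a 9-tuple of parallel sequences,
# matching the yield above, so a hypothetical consumer could unpack it as
#
#   for (word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, ctx_0_idx,
#        ctx_p1_idx, ctx_p2_idx, mark, label_idx) in test():
#       assert len(word_idx) == len(label_idx)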


def get_dict():
    """
    Download and load the word, verb and label dictionaries.
    """
    word_dict = load_dict(
        common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
    verb_dict = load_dict(
        common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
    label_dict = load_dict(
        common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
    return word_dict, verb_dict, label_dict


def get_embedding():
    """
    Download the pre-trained word embedding file and return its local path.
    """
    return common.download(EMB_URL, 'conll05st', EMB_MD5)


def test():
    """
    Create an iterator over the Conll 2005 test set that yields the indexed
    features produced by reader_creator.
    """
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        common.download(DATA_URL, 'conll05st', DATA_MD5),
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)


if __name__ == '__main__':
    print get_embedding()
    for f in test():
        print f