import paddle.v2.dataset.common as common
import tarfile
import gzip
import itertools

__all__ = ['test', 'get_dict', 'get_embedding']
|  | """ | ||||||
|  | Conll 2005 dataset.  Paddle semantic role labeling Book and demo use this | ||||||
|  | dataset as an example. Because Conll 2005 is not free in public, the default | ||||||
|  | downloaded URL is test set of Conll 2005 (which is public). Users can change | ||||||
|  | URL and MD5 to their Conll dataset. | ||||||
|  | """ | ||||||

DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'

UNK_IDX = 0


def load_dict(filename):
    """
    Load a dictionary file: one token per line, with the 0-based line number
    used as the token's ID.
    """
    d = dict()
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            d[line.strip()] = i
    return d


def corpus_reader(data_path, words_name, props_name):
    """
    Read one corpus from a conll05st tarball. It returns a reader function
    whose iterator yields, for each predicate of each sentence, a tuple of
    the sentence (a list of words), the predicate word, and the label
    sequence (a list of IOB tags, one per word).

    :param data_path: path of the corpus tarball.
    :type data_path: basestring
    :param words_name: member name of the gzipped words file in the tarball.
    :type words_name: basestring
    :param props_name: member name of the gzipped props file in the tarball.
    :type props_name: basestring
    :return: a reader function.
    :rtype: callable
    """

    def reader():
        tf = tarfile.open(data_path)
        wf = tf.extractfile(words_name)
        pf = tf.extractfile(props_name)
        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
                fileobj=pf) as props_file:
            sentences = []
            labels = []
            one_seg = []
            for word, label in itertools.izip(words_file, props_file):
                word = word.strip()
                label = label.strip().split()

                if len(label) == 0:  # end of sentence
                    # transpose the per-word prop fields so that each column
                    # becomes one label sequence
                    for i in xrange(len(one_seg[0])):
                        a_kind_label = [x[i] for x in one_seg]
                        labels.append(a_kind_label)

                    if len(labels) >= 1:
                        # the first column lists the predicate word of each
                        # frame; the remaining columns carry its arguments
                        verb_list = []
                        for x in labels[0]:
                            if x != '-':
                                verb_list.append(x)

                        # convert each bracketed column into an IOB tag
                        # sequence and emit one sample per predicate
                        for i, lbl in enumerate(labels[1:]):
                            cur_tag = 'O'
                            is_in_bracket = False
                            lbl_seq = []
                            for l in lbl:
                                if l == '*' and not is_in_bracket:
                                    lbl_seq.append('O')
                                elif l == '*' and is_in_bracket:
                                    lbl_seq.append('I-' + cur_tag)
                                elif l == '*)':
                                    lbl_seq.append('I-' + cur_tag)
                                    is_in_bracket = False
                                elif l.find('(') != -1 and l.find(')') != -1:
                                    cur_tag = l[1:l.find('*')]
                                    lbl_seq.append('B-' + cur_tag)
                                    is_in_bracket = False
                                elif l.find('(') != -1 and l.find(')') == -1:
                                    cur_tag = l[1:l.find('*')]
                                    lbl_seq.append('B-' + cur_tag)
                                    is_in_bracket = True
                                else:
                                    print 'error:', l

                            yield sentences, verb_list[i], lbl_seq

                    sentences = []
                    labels = []
                    one_seg = []
                else:
                    sentences.append(word)
                    one_seg.append(label)

    return reader
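
# An illustrative example (not in the original file) of the conversion done
# by corpus_reader above: a per-predicate props column whose fields (one per
# word, read top to bottom) are
#
#   (A0*   *)   (V*)   (A1*   *   *)
#
# is turned into the IOB tag sequence
#
#   ['B-A0', 'I-A0', 'B-V', 'B-A1', 'I-A1', 'I-A1']
#
# and yielded together with the sentence's word list and the predicate word
# taken from the first props column.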


def reader_creator(corpus_reader,
                   word_dict=None,
                   predicate_dict=None,
                   label_dict=None):
    def reader():
        for sentence, predicate, labels in corpus_reader():

            sen_len = len(sentence)

            # build a five-word context window centered on the predicate;
            # mark flags the positions inside the window, and 'bos'/'eos'
            # pad the context words at sentence boundaries
            verb_index = labels.index('B-V')
            mark = [0] * len(labels)
            if verb_index > 0:
                mark[verb_index - 1] = 1
                ctx_n1 = sentence[verb_index - 1]
            else:
                ctx_n1 = 'bos'

            if verb_index > 1:
                mark[verb_index - 2] = 1
                ctx_n2 = sentence[verb_index - 2]
            else:
                ctx_n2 = 'bos'

            mark[verb_index] = 1
            ctx_0 = sentence[verb_index]

            if verb_index < len(labels) - 1:
                mark[verb_index + 1] = 1
                ctx_p1 = sentence[verb_index + 1]
            else:
                ctx_p1 = 'eos'

            if verb_index < len(labels) - 2:
                mark[verb_index + 2] = 1
                ctx_p2 = sentence[verb_index + 2]
            else:
                ctx_p2 = 'eos'

            # map words, predicate, context words and labels to dictionary
            # IDs; unknown words fall back to UNK_IDX
            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
            pred_idx = [predicate_dict.get(predicate)] * sen_len

            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len

            label_idx = [label_dict.get(w) for w in labels]

            yield word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, \
                ctx_0_idx, ctx_p1_idx, ctx_p2_idx, mark, label_idx

    return reader()
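
# A hedged usage sketch (not part of the original file): each sample produced
# by the reader that test() builds below is a 9-tuple of parallel sequences,
# matching the yield above, so a hypothetical consumer could unpack it as
#
#   for (word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, ctx_0_idx,
#        ctx_p1_idx, ctx_p2_idx, mark, label_idx) in test():
#       assert len(word_idx) == len(label_idx)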


def get_dict():
    """
    Download and load the word, verb and label dictionaries.
    """
    word_dict = load_dict(
        common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
    verb_dict = load_dict(
        common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
    label_dict = load_dict(
        common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
    return word_dict, verb_dict, label_dict


def get_embedding():
    """
    Download the pre-trained word embedding file and return its local path.
    """
    return common.download(EMB_URL, 'conll05st', EMB_MD5)


def test():
    """
    Create an iterator over the Conll 2005 test set that yields the indexed
    features produced by reader_creator.
    """
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        common.download(DATA_URL, 'conll05st', DATA_MD5),
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)


if __name__ == '__main__':
    print get_embedding()
    for f in test():
        print f