@@ -0,0 +1,188 @@
import paddle.v2.dataset.common as common
import tarfile
import gzip
import itertools

__all__ = ['test', 'get_dict', 'get_embedding']
"""
Conll 2005 dataset.  Paddle semantic role labeling Book and demo use this
dataset as an example.  Because Conll 2005 is not freely available to the
public, the default download URL points to the Conll 2005 test set (which
is public).  Users can change the URL and MD5 to use their own Conll
dataset.
"""

DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'

UNK_IDX = 0


def load_dict(filename):
    d = dict()
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            d[line.strip()] = i
    return d


def corpus_reader(data_path, words_name, props_name):
    """
    Read one corpus from the downloaded tarball. It returns a reader
    function; each element of the iterator produced by that reader is a
    tuple of a sentence (a list of words), the predicate word, and the
    label sequence (a list of BIO label strings, one per word).
    :param data_path: path of the tarball that holds the corpus.
    :type data_path: basestring
    :param words_name: member name of the words file inside the tarball.
    :type words_name: basestring
    :param props_name: member name of the props file inside the tarball.
    :type props_name: basestring
    :return: a reader of the corpus data.
    :rtype: callable
    """
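    # The props file has one line per token and a blank line between
    # sentences; column 0 is the predicate lemma ('-' for non-predicates)
    # and each remaining column carries the bracketed argument spans of one
    # predicate, e.g. '(A0*', '*', '*)', '(V*)'. This reader transposes
    # those columns and converts each one into a BIO tag sequence.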

    def reader():
        tf = tarfile.open(data_path)
        wf = tf.extractfile(words_name)
        pf = tf.extractfile(props_name)
        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
                fileobj=pf) as props_file:
            sentences = []
            labels = []
            one_seg = []
            for word, label in itertools.izip(words_file, props_file):
                word = word.strip()
                label = label.strip().split()

                if len(label) == 0:  # end of sentence
                    for i in xrange(len(one_seg[0])):
                        a_kind_label = [x[i] for x in one_seg]
                        labels.append(a_kind_label)

                    if len(labels) >= 1:
                        verb_list = []
                        for x in labels[0]:
                            if x != '-':
                                verb_list.append(x)

                        for i, lbl in enumerate(labels[1:]):
                            cur_tag = 'O'
                            is_in_bracket = False
                            lbl_seq = []
                            verb_word = ''
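                            # Convert the bracketed span notation of this
                            # predicate column into BIO tags: '(X*' opens a
                            # span X (B-X), a bare '*' continues it (I-X) or
                            # is outside any span (O), and '*)' closes the
                            # current span.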
                            for l in lbl:
                                if l == '*' and is_in_bracket == False:
                                    lbl_seq.append('O')
                                elif l == '*' and is_in_bracket == True:
                                    lbl_seq.append('I-' + cur_tag)
                                elif l == '*)':
                                    lbl_seq.append('I-' + cur_tag)
                                    is_in_bracket = False
                                elif l.find('(') != -1 and l.find(')') != -1:
                                    cur_tag = l[1:l.find('*')]
                                    lbl_seq.append('B-' + cur_tag)
                                    is_in_bracket = False
                                elif l.find('(') != -1 and l.find(')') == -1:
                                    cur_tag = l[1:l.find('*')]
                                    lbl_seq.append('B-' + cur_tag)
                                    is_in_bracket = True
                                else:
                                    print 'error:', l

                            yield sentences, verb_list[i], lbl_seq

                    sentences = []
                    labels = []
                    one_seg = []
                else:
                    sentences.append(word)
                    one_seg.append(label)

    return reader


def reader_creator(corpus_reader,
                   word_dict=None,
                   predicate_dict=None,
                   label_dict=None):
    def reader():
        for sentence, predicate, labels in corpus_reader():

            sen_len = len(sentence)

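            # Feature construction: locate the predicate ('B-V'), build a
            # window of the two words before and after it (ctx_n2, ctx_n1,
            # ctx_0, ctx_p1, ctx_p2, padded with 'bos'/'eos' at sentence
            # boundaries) and a 0/1 mark vector flagging window positions.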
            verb_index = labels.index('B-V')
            mark = [0] * len(labels)
            if verb_index > 0:
                mark[verb_index - 1] = 1
                ctx_n1 = sentence[verb_index - 1]
            else:
                ctx_n1 = 'bos'

            if verb_index > 1:
                mark[verb_index - 2] = 1
                ctx_n2 = sentence[verb_index - 2]
            else:
                ctx_n2 = 'bos'

            mark[verb_index] = 1
            ctx_0 = sentence[verb_index]

            if verb_index < len(labels) - 1:
                mark[verb_index + 1] = 1
                ctx_p1 = sentence[verb_index + 1]
            else:
                ctx_p1 = 'eos'

            if verb_index < len(labels) - 2:
                mark[verb_index + 2] = 1
                ctx_p2 = sentence[verb_index + 2]
            else:
                ctx_p2 = 'eos'

            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
            pred_idx = [predicate_dict.get(predicate)] * sen_len

            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len

            label_idx = [label_dict.get(w) for w in labels]

            yield word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, \
              ctx_0_idx, ctx_p1_idx, ctx_p2_idx, mark, label_idx

    return reader()


def get_dict():
    """
    Get the word, verb and label dictionaries of the Conll 2005 corpus.
    """
    word_dict = load_dict(
        common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
    verb_dict = load_dict(
        common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
    label_dict = load_dict(
        common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
    return word_dict, verb_dict, label_dict


def get_embedding():
    """
    Download and return the local path of the word embedding file.
    """
    return common.download(EMB_URL, 'conll05st', EMB_MD5)


def test():
    """
    Conll 2005 test set reader. Each sample it yields is a tuple of
    (word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, ctx_0_idx, ctx_p1_idx,
    ctx_p2_idx, mark, label_idx).
    """
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        common.download(DATA_URL, 'conll05st', DATA_MD5),
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)


if __name__ == '__main__':
    print get_embedding()
    for f in test():
        print f