diff --git a/demo/gan/data/download_cifar.sh b/demo/gan/data/download_cifar.sh index ae24ef2b7f..bbadc7c10c 100755 --- a/demo/gan/data/download_cifar.sh +++ b/demo/gan/data/download_cifar.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/demo/gan/data/get_mnist_data.sh b/demo/gan/data/get_mnist_data.sh old mode 100644 new mode 100755 diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh index 52e82d0d98..532178d627 100755 --- a/demo/image_classification/data/download_cifar.sh +++ b/demo/image_classification/data/download_cifar.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py index 87eed5eebd..6a315ff094 100644 --- a/demo/image_classification/image_provider.py +++ b/demo/image_classification/image_provider.py @@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import * # # {'img_size': 32, -# 'settings': , +# 'settings': a global object, # 'color': True, # 'mean_img_size': 32, # 'meta': './data/cifar-out/batches/batches.meta', @@ -50,10 +50,10 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg, settings.logger.info('Image size: %s', settings.img_size) settings.logger.info('Meta path: %s', settings.meta_path) - settings.input_types = [ - dense_vector(settings.img_raw_size), # image feature - integer_value(settings.num_classes) - ] # labels + settings.input_types = { + 'image': dense_vector(settings.img_raw_size), + 'label': integer_value(settings.num_classes) + } settings.logger.info('DataProvider Initialization finished') @@ -83,4 +83,7 @@ def processData(settings, file_list): img, settings.img_mean, settings.img_size, settings.is_train, settings.color) label = data['labels'][i] - yield img_feat.astype('float32'), int(label) + yield { + 'image': img_feat.astype('float32'), + 'label': int(label) + } diff --git a/demo/introduction/.gitignore b/demo/introduction/.gitignore new file mode 100644 index 0000000000..c54f3f9480 --- /dev/null +++ b/demo/introduction/.gitignore @@ -0,0 +1,5 @@ +dataprovider.pyc +empty.list +train.log +output +train.list diff --git a/demo/introduction/dataprovider.py b/demo/introduction/dataprovider.py index 03c920cc34..5b48aad040 100644 --- a/demo/introduction/dataprovider.py +++ b/demo/introduction/dataprovider.py @@ -17,8 +17,10 @@ import random # define data types of input: 2 real numbers -@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False) +@provider( + input_types={'x': dense_vector(1), + 'y': dense_vector(1)}, use_seq=False) def process(settings, input_file): for i in xrange(2000): x = random.random() - yield [x], [2 * x + 0.3] + yield {'x': [x], 'y': [2 * x + 0.3]} diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py index 41cebcf6e1..ecafe955f9 100644 --- a/demo/introduction/trainer_config.py +++ b/demo/introduction/trainer_config.py @@ -15,11 +15,8 @@ from paddle.trainer_config_helpers import * # 1. read data. Suppose you saved above python code as dataprovider.py -data_file = 'empty.list' -with open(data_file, 'w') as f: - f.writelines(' ') define_py_data_sources2( - train_list=data_file, + train_list=['no_matter.txt'], test_list=None, module='dataprovider', obj='process', diff --git a/demo/quick_start/.gitignore b/demo/quick_start/.gitignore index d6bc73105b..f71662563f 100644 --- a/demo/quick_start/.gitignore +++ b/demo/quick_start/.gitignore @@ -8,6 +8,8 @@ data/test.list data/test.txt data/train.list data/train.txt +data/pred.list +data/pred.txt dataprovider_copy_1.py train.log output diff --git a/demo/quick_start/dataprovider_bow.py b/demo/quick_start/dataprovider_bow.py index 8e651d77bf..2745495586 100644 --- a/demo/quick_start/dataprovider_bow.py +++ b/demo/quick_start/dataprovider_bow.py @@ -31,16 +31,16 @@ def initializer(settings, dictionary, **kwargs): # setting.input_types specifies what the data types the data provider # generates. - settings.input_types = [ + settings.input_types = { # The first input is a sparse_binary_vector, # which means each dimension of the vector is either 0 or 1. It is the # bag-of-words (BOW) representation of the texts. - sparse_binary_vector(len(dictionary)), + 'word': sparse_binary_vector(len(dictionary)), # The second input is an integer. It represents the category id of the # sample. 2 means there are two labels in the dataset. # (1 for positive and 0 for negative) - integer_value(2) - ] + 'label': integer_value(2) + } # Delaring a data provider. It has an initializer 'data_initialzer'. @@ -67,12 +67,12 @@ def process(settings, file_name): # Return the features for the current comment. The first is a list # of ids representing a 0-1 binary sparse vector of the text, # the second is the integer id of the label. - yield word_vector, int(label) + yield {'word': word_vector, 'label': int(label)} def predict_initializer(settings, dictionary, **kwargs): settings.word_dict = dictionary - settings.input_types = [sparse_binary_vector(len(dictionary))] + settings.input_types = {'word': sparse_binary_vector(len(dictionary))} # Declaring a data provider for prediction. The difference with process @@ -83,4 +83,4 @@ def process_predict(settings, file_name): for line in f: comment = line.strip().split() word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment] - yield word_vector + yield {'word': word_vector} diff --git a/demo/quick_start/dataprovider_emb.py b/demo/quick_start/dataprovider_emb.py index b010253a8a..ddfa3ce9b7 100755 --- a/demo/quick_start/dataprovider_emb.py +++ b/demo/quick_start/dataprovider_emb.py @@ -19,13 +19,13 @@ UNK_IDX = 0 def initializer(settings, dictionary, **kwargs): settings.word_dict = dictionary - settings.input_types = [ + settings.input_types = { # Define the type of the first input as sequence of integer. # The value of the integers range from 0 to len(dictrionary)-1 - integer_value_sequence(len(dictionary)), + 'word': integer_value_sequence(len(dictionary)), # Define the second input for label id - integer_value(2) - ] + 'label': integer_value(2) + } @provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) @@ -35,15 +35,12 @@ def process(settings, file_name): label, comment = line.strip().split('\t') words = comment.split() word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - yield word_slot, int(label) + yield {'word': word_slot, 'label': int(label)} def predict_initializer(settings, dictionary, **kwargs): settings.word_dict = dictionary - settings.input_types = [ - integer_value( - len(dictionary), seq_type=SequenceType.SEQUENCE) - ] + settings.input_types = {'word': integer_value_sequence(len(dictionary))} @provider(init_hook=predict_initializer, should_shuffle=False) @@ -52,4 +49,4 @@ def process_predict(settings, file_name): for line in f: comment = line.strip().split() word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment] - yield word_slot + yield {'word': word_slot} diff --git a/demo/recommendation/common_utils.py b/demo/recommendation/common_utils.py index d4fbdad1d7..c20c652866 100755 --- a/demo/recommendation/common_utils.py +++ b/demo/recommendation/common_utils.py @@ -17,13 +17,14 @@ from paddle.trainer.PyDataProvider2 import * def meta_to_header(meta, name): metas = meta[name]['__meta__']['raw_meta'] for each_meta in metas: + slot_name = each_meta.get('name', '%s_id' % name) if each_meta['type'] == 'id': - yield integer_value(each_meta['max']) + yield slot_name, integer_value(each_meta['max']) elif each_meta['type'] == 'embedding': is_seq = each_meta['seq'] == 'sequence' - yield integer_value( + yield slot_name, integer_value( len(each_meta['dict']), seq_type=SequenceType.SEQUENCE if is_seq else SequenceType.NO_SEQUENCE) elif each_meta['type'] == 'one_hot_dense': - yield dense_vector(len(each_meta['dict'])) + yield slot_name, dense_vector(len(each_meta['dict'])) diff --git a/demo/recommendation/dataprovider.py b/demo/recommendation/dataprovider.py index 80c62d7561..c4ff96d80e 100755 --- a/demo/recommendation/dataprovider.py +++ b/demo/recommendation/dataprovider.py @@ -16,6 +16,14 @@ from paddle.trainer.PyDataProvider2 import * import common_utils # parse +def __list_to_map__(lst): + ret_val = dict() + for each in lst: + k, v = each + ret_val[k] = v + return ret_val + + def hook(settings, meta, **kwargs): """ Init hook is invoked before process data. It will set obj.slots and store @@ -34,12 +42,16 @@ def hook(settings, meta, **kwargs): # second part is user features. # final part is rating score. # header is a list of [USE_SEQ_OR_NOT?, SlotType] - headers = list(common_utils.meta_to_header(meta, 'movie')) - headers.extend(list(common_utils.meta_to_header(meta, 'user'))) - headers.append(dense_vector(1)) # Score + movie_headers = list(common_utils.meta_to_header(meta, 'movie')) + settings.movie_names = [h[0] for h in movie_headers] + headers = movie_headers + user_headers = list(common_utils.meta_to_header(meta, 'user')) + settings.user_names = [h[0] for h in user_headers] + headers.extend(user_headers) + headers.append(("rating", dense_vector(1))) # Score # slot types. - settings.input_types = headers + settings.input_types = __list_to_map__(headers) settings.meta = meta @@ -57,20 +69,20 @@ def process(settings, filename): movie_meta = settings.meta['movie'][movie_id] user_meta = settings.meta['user'][user_id] - outputs = [movie_id - 1] + outputs = [('movie_id', movie_id - 1)] # Then add movie features - for each_meta in movie_meta: - outputs.append(each_meta) + for i, each_meta in enumerate(movie_meta): + outputs.append((settings.movie_names[i + 1], each_meta)) # Then add user id. - outputs.append(user_id - 1) + outputs.append(('user_id', user_id - 1)) # Then add user features. - for each_meta in user_meta: - outputs.append(each_meta) + for i, each_meta in enumerate(user_meta): + outputs.append((settings.user_names[i + 1], each_meta)) # Finally, add score - outputs.append([score]) + outputs.append(('rating', [score])) # Return data to paddle - yield outputs + yield __list_to_map__(outputs) diff --git a/demo/recommendation/prediction.py b/demo/recommendation/prediction.py index 191120188e..8ad993eab3 100755 --- a/demo/recommendation/prediction.py +++ b/demo/recommendation/prediction.py @@ -34,8 +34,8 @@ if __name__ == '__main__': network.loadParameters(model_path) with open('./data/meta.bin', 'rb') as f: meta = pickle.load(f) - headers = list(meta_to_header(meta, 'movie')) - headers.extend(list(meta_to_header(meta, 'user'))) + headers = [h[1] for h in meta_to_header(meta, 'movie')] + headers.extend([h[1] for h in meta_to_header(meta, 'user')]) cvt = DataProviderConverter(headers) while True: movie_id = int(raw_input("Input movie_id: ")) diff --git a/demo/recommendation/preprocess.sh b/demo/recommendation/preprocess.sh index e121e47019..dc6b2cdfc1 100755 --- a/demo/recommendation/preprocess.sh +++ b/demo/recommendation/preprocess.sh @@ -25,7 +25,7 @@ python meta_generator.py $dir meta.bin --config=meta_config.json echo 'split train/test file' python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1 echo 'shuffle train file' -shuf $dir/ratings.dat.train > ratings.dat.train +gshuf $dir/ratings.dat.train > ratings.dat.train cp $dir/ratings.dat.test . echo "./data/ratings.dat.train" > train.list echo "./data/ratings.dat.test" > test.list diff --git a/demo/semantic_role_labeling/.gitignore b/demo/semantic_role_labeling/.gitignore index cd90ca7bbe..65c9b674c7 100644 --- a/demo/semantic_role_labeling/.gitignore +++ b/demo/semantic_role_labeling/.gitignore @@ -8,3 +8,7 @@ data/test.wsj.seq_pair data/test.wsj.words data/tgt.dict output +data/emb +data/targetDict.txt +data/verbDict.txt +data/wordDict.txt diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh old mode 100644 new mode 100755 diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py index 0fcf993d57..d7cb95c477 100644 --- a/python/paddle/trainer_config_helpers/data_sources.py +++ b/python/paddle/trainer_config_helpers/data_sources.py @@ -69,7 +69,7 @@ def define_py_data_source(file_list, """ if isinstance(file_list, list): file_list_name = 'train.list' - if isinstance(cls, TestData): + if cls == TestData: file_list_name = 'test.list' with open(file_list_name, 'w') as f: f.writelines(file_list) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 8dd6b7b7d2..c10fa671bd 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -970,7 +970,7 @@ def pooling_layer(input, :param layer_attr: The Extra Attributes for layer, such as dropout. :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. - :rtype: LayerType + :rtype: LayerOutput """ extra_dict = dict() # noinspection PyUnresolvedReferences