Merge branch 'develop' of https://github.com/baidu/Paddle into cmrnorm

9 years ago · bf32411191
parent 5fddd99e18 8a42a54968
commit bf32411191
117 changed files with 2318 additions and 686 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -29,10 +29,6 @@ addons:
      - python-pip
      - python2.7-dev
      - m4
      - libprotobuf-dev
      - doxygen
      - protobuf-compiler
      - python-protobuf
      - python-numpy
      - python-wheel
      - libgoogle-glog-dev
@ -43,6 +39,8 @@ addons:
      - graphviz
      - swig
      - clang-format-3.8
      - automake
      - libtool
 before_install:
  - |
    if [ ${JOB} == "BUILD_AND_TEST" ]; then
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1 +0,0 @@
 ./doc/howto/contribute_to_paddle_en.md
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1 @@
 ./doc/howto/dev/contribute_to_paddle_en.md
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
    ${source}
    ${destination}
    COMMENT "Generating sphinx documentation: ${builder}"
-    COMMAND ln -s ${destination}/index_*.html ${destination}/index.html
+    COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html
    )
  set_property(
--- a/cmake/check_packages.cmake
+++ b/cmake/check_packages.cmake
@ -24,7 +24,6 @@ endif()
 if(WITH_DOC)
  find_package(Sphinx REQUIRED)
  find_package(Doxygen REQUIRED)
  find_python_module(recommonmark REQUIRED)
 endif()
--- a/demo/gan/data/download_cifar.sh
+++ b/demo/gan/data/download_cifar.sh
@ -1,3 +1,4 @@
 #!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
--- a/demo/gan/data/get_mnist_data.sh
+++ b/demo/gan/data/get_mnist_data.sh
--- a/demo/gan/gan_conf_image.py
+++ b/demo/gan/gan_conf_image.py
@ -87,9 +87,9 @@ def conv_bn(input,
    print(imgSize, output_x, stride, filter_size, padding)
    if trans:
        nameApx = "_conv"
    else:
        nameApx = "_convt"
    else:
        nameApx = "_conv"
    if bn:
        conv = img_conv_layer(
--- a/demo/image_classification/data/download_cifar.sh
+++ b/demo/image_classification/data/download_cifar.sh
@ -1,3 +1,4 @@
 #!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
--- a/demo/image_classification/image_provider.py
+++ b/demo/image_classification/image_provider.py
@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *
 #
 # {'img_size': 32,
-# 'settings': <paddle.trainer.PyDataProviderWrapper.Cls instance at 0x7fea27cb6050>,
+# 'settings': a global object,
 # 'color': True,
 # 'mean_img_size': 32,
 # 'meta': './data/cifar-out/batches/batches.meta',
@ -50,10 +50,10 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
    settings.logger.info('Image size: %s', settings.img_size)
    settings.logger.info('Meta path: %s', settings.meta_path)
-    settings.input_types = [
+    settings.input_types = {
-        dense_vector(settings.img_raw_size),  # image feature
+        'image': dense_vector(settings.img_raw_size),
-        integer_value(settings.num_classes)
+        'label': integer_value(settings.num_classes)
-    ]  # labels
+    }
    settings.logger.info('DataProvider Initialization finished')
@ -83,4 +83,7 @@ def processData(settings, file_list):
                        img, settings.img_mean, settings.img_size,
                        settings.is_train, settings.color)
                    label = data['labels'][i]
-                    yield img_feat.astype('float32'), int(label)
+                    yield {
                        'image': img_feat.astype('float32'),
                        'label': int(label)
                    }
--- a/demo/introduction/.gitignore
+++ b/demo/introduction/.gitignore
@ -0,0 +1,5 @@
 dataprovider.pyc
 empty.list
 train.log
 output
 train.list
--- a/demo/introduction/dataprovider.py
+++ b/demo/introduction/dataprovider.py
@ -17,8 +17,10 @@ import random
 # define data types of input: 2 real numbers
-@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
+@provider(
    input_types={'x': dense_vector(1),
                 'y': dense_vector(1)}, use_seq=False)
 def process(settings, input_file):
    for i in xrange(2000):
        x = random.random()
-        yield [x], [2 * x + 0.3]
+        yield {'x': [x], 'y': [2 * x + 0.3]}
--- a/demo/introduction/trainer_config.py
+++ b/demo/introduction/trainer_config.py
@ -15,11 +15,8 @@
 from paddle.trainer_config_helpers import *
 # 1. read data. Suppose you saved above python code as dataprovider.py
 data_file = 'empty.list'
 with open(data_file, 'w') as f:
    f.writelines(' ')
 define_py_data_sources2(
-    train_list=data_file,
+    train_list=['no_matter.txt'],
    test_list=None,
    module='dataprovider',
    obj='process',
--- a/demo/mnist/mnist_provider.py
+++ b/demo/mnist/mnist_provider.py
@ -1,10 +1,12 @@
 from paddle.trainer.PyDataProvider2 import *
 import numpy
 # Define a py data provider
@provider(
    input_types={'pixel': dense_vector(28 * 28),
-                 'label': integer_value(10)})
+                 'label': integer_value(10)},
    cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, filename):  # settings is not used currently.
    imgf = filename + "-images-idx3-ubyte"
    labelf = filename + "-labels-idx1-ubyte"
@ -20,12 +22,13 @@ def process(settings, filename):  # settings is not used currently.
    else:
        n = 10000
-    for i in range(n):
+    images = numpy.fromfile(
-        label = ord(l.read(1))
+        f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
-        pixels = []
+    images = images / 255.0 * 2.0 - 1.0
-        for j in range(28 * 28):
+    labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
-            pixels.append(float(ord(f.read(1))) / 255.0)
+
-        yield {"pixel": pixels, 'label': label}
+    for i in xrange(n):
        yield {"pixel": images[i, :], 'label': labels[i]}
    f.close()
    l.close()
--- a/demo/quick_start/.gitignore
+++ b/demo/quick_start/.gitignore
@ -8,6 +8,8 @@ data/test.list
 data/test.txt
 data/train.list
 data/train.txt
 data/pred.list
 data/pred.txt
 dataprovider_copy_1.py
 train.log
 output
--- a/demo/quick_start/api_predict.py
+++ b/demo/quick_start/api_predict.py
@ -0,0 +1,147 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os, sys
 import numpy as np
 from optparse import OptionParser
 from py_paddle import swig_paddle, DataProviderConverter
 from paddle.trainer.PyDataProvider2 import sparse_binary_vector
 from paddle.trainer.config_parser import parse_config
 """
 Usage: run following command to show help message.
  python api_predict.py -h
 """
 class QuickStartPrediction():
    """
    Prediction helper for the quick_start demo.

    Loads a word dictionary, builds a gradient machine from the trainer
    config with "is_predict=1", loads its parameters from model_dir, and
    feeds batches of word-index lists through a DataProviderConverter
    declared with a single sparse_binary_vector input.
    """

    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
        """
        train_conf: trainer configure.
        dict_file: word dictionary file name.
        model_dir: directory of model. Defaults to the directory that
            contains train_conf when None.
        label_file: optional label file; when given it is loaded into
            self.label (see load_label).
        """
        self.train_conf = train_conf
        self.dict_file = dict_file
        self.word_dict = {}
        self.dict_dim = self.load_dict()
        self.model_dir = model_dir
        if model_dir is None:
            self.model_dir = os.path.dirname(train_conf)
        self.label = None
        if label_file is not None:
            self.load_label(label_file)
        conf = parse_config(train_conf, "is_predict=1")
        self.network = swig_paddle.GradientMachine.createFromConfigProto(
            conf.model_config)
        self.network.loadParameters(self.model_dir)
        input_types = [sparse_binary_vector(self.dict_dim)]
        self.converter = DataProviderConverter(input_types)

    def load_dict(self):
        """
        Load dictionary from self.dict_file.

        The first tab-separated field of every line is taken as the word and
        the zero-based line number becomes its index in self.word_dict.
        Returns the number of distinct words loaded.
        """
        # Use a context manager so the file handle is closed deterministically.
        with open(self.dict_file, 'r') as dict_f:
            for line_count, line in enumerate(dict_f):
                self.word_dict[line.strip().split('\t')[0]] = line_count
        return len(self.word_dict)

    def load_label(self, label_file):
        """
        Load label.

        Each line is "<label name>\\t<integer id>"; the result is stored in
        self.label as a mapping from integer id to label name.
        """
        self.label = {}
        with open(label_file, 'r') as label_f:
            for line in label_f:
                fields = line.split('\t')
                # int() tolerates the trailing newline left on the id field.
                self.label[int(fields[1])] = fields[0]

    def get_index(self, data):
        """
        transform word into integer index according to the dictionary.

        Words that are not in the dictionary are dropped.
        """
        words = data.strip().split()
        return [self.word_dict[w] for w in words if w in self.word_dict]

    def batch_predict(self, data_batch):
        """
        Run a forward pass on data_batch and print the predicted ids.

        data_batch: list of samples in the layout expected by self.converter.
        """
        batch_input = self.converter(data_batch)
        output = self.network.forwardTest(batch_input)
        prob = output[0]["id"].tolist()
        print("predicting labels is:")
        # Parenthesized form prints identically on Python 2 and also parses
        # on Python 3.
        print(prob)
 def option_parser():
    """
    Parse the command line of api_predict.py.

    Returns the (options, args) pair produced by OptionParser.parse_args().
    The options carry train_conf, dict_file, label, batch_size (int,
    default 1) and model_path.
    """
    # The samples to predict are read from stdin, not from an option.
    usage = "python api_predict.py -n config -w model_dir -d dictionary"
    parser = OptionParser(usage="usage: %s [options] < input_file" % usage)
    parser.add_option(
        "-n",
        "--tconf",
        action="store",
        dest="train_conf",
        help="network config")
    parser.add_option(
        "-d",
        "--dict",
        action="store",
        dest="dict_file",
        help="dictionary file")
    parser.add_option(
        "-b",
        "--label",
        action="store",
        dest="label",
        default=None,
        help="label file")
    parser.add_option(
        "-c",
        "--batch_size",
        type="int",
        action="store",
        dest="batch_size",
        default=1,
        help="the batch size for prediction")
    parser.add_option(
        "-w",
        "--model",
        action="store",
        dest="model_path",
        default=None,
        help="model path")
    return parser.parse_args()
 def main():
    """
    Read "<label>\\t<text>" lines from stdin and print predictions.

    All non-blank stdin lines are collected into one batch; the true labels
    are printed first, then the predicted ids via batch_predict().
    NOTE: --batch_size is parsed by option_parser() but is not used here;
    the caller limits the number of lines piped in.
    """
    options, args = option_parser()
    swig_paddle.initPaddle("--use_gpu=0")
    predict = QuickStartPrediction(options.train_conf, options.dict_file,
                                   options.model_path, options.label)
    batch = []
    labels = []
    for line in sys.stdin:
        if not line.strip():
            # Skip blank lines instead of failing on the unpack below.
            continue
        # Split on the first tab only so tabs inside the text are harmless.
        label_id, text = line.split("\t", 1)
        labels.append(int(label_id))
        batch.append([predict.get_index(text)])
    print("labels is:")
    # Parenthesized form prints identically on Python 2 and parses on 3.
    print(labels)
    predict.batch_predict(batch)
 if __name__ == '__main__':
    main()
--- a/demo/quick_start/api_predict.sh
+++ b/demo/quick_start/api_predict.sh
@ -0,0 +1,30 @@
 #!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 set -e
 # Note: the default model is output/pass-00001; you should make sure the
 # model path exists or change the model path.
 # Only tested with trainer_config.lr.py.
 model=output/pass-00001/
 config=trainer_config.lr.py
 label=data/labels.list
 dict=data/dict.txt
 batch_size=20
 head -n$batch_size data/test.txt | python api_predict.py \
     --tconf=$config\
     --model=$model \
     --label=$label \
     --dict=$dict \
     --batch_size=$batch_size
--- a/demo/quick_start/dataprovider_bow.py
+++ b/demo/quick_start/dataprovider_bow.py
@ -31,16 +31,16 @@ def initializer(settings, dictionary, **kwargs):
    # setting.input_types specifies what the data types the data provider
    # generates.
-    settings.input_types = [
+    settings.input_types = {
        # The first input is a sparse_binary_vector,
        # which means each dimension of the vector is either 0 or 1. It is the
        # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
+        'word': sparse_binary_vector(len(dictionary)),
        # The second input is an integer. It represents the category id of the
        # sample. 2 means there are two labels in the dataset.
        # (1 for positive and 0 for negative)
-        integer_value(2)
+        'label': integer_value(2)
-    ]
+    }
 # Declaring a data provider. It has an initializer 'initializer'.
@ -67,12 +67,12 @@ def process(settings, file_name):
            # Return the features for the current comment. The first is a list
            # of ids representing a 0-1 binary sparse vector of the text,
            # the second is the integer id of the label.
-            yield word_vector, int(label)
+            yield {'word': word_vector, 'label': int(label)}
 def predict_initializer(settings, dictionary, **kwargs):
    settings.word_dict = dictionary
-    settings.input_types = [sparse_binary_vector(len(dictionary))]
+    settings.input_types = {'word': sparse_binary_vector(len(dictionary))}
 # Declaring a data provider for prediction. The difference with process
@ -83,4 +83,4 @@ def process_predict(settings, file_name):
        for line in f:
            comment = line.strip().split()
            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_vector
+            yield {'word': word_vector}
--- a/demo/quick_start/dataprovider_emb.py
+++ b/demo/quick_start/dataprovider_emb.py
@ -19,13 +19,13 @@ UNK_IDX = 0
 def initializer(settings, dictionary, **kwargs):
    settings.word_dict = dictionary
-    settings.input_types = [
+    settings.input_types = {
        # Define the type of the first input as sequence of integer.
        # The values of the integers range from 0 to len(dictionary)-1
-        integer_value_sequence(len(dictionary)),
+        'word': integer_value_sequence(len(dictionary)),
        # Define the second input for label id
-        integer_value(2)
+        'label': integer_value(2)
-    ]
+    }
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
@ -35,15 +35,12 @@ def process(settings, file_name):
            label, comment = line.strip().split('\t')
            words = comment.split()
            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-            yield word_slot, int(label)
+            yield {'word': word_slot, 'label': int(label)}
 def predict_initializer(settings, dictionary, **kwargs):
    settings.word_dict = dictionary
-    settings.input_types = [
+    settings.input_types = {'word': integer_value_sequence(len(dictionary))}
        integer_value(
            len(dictionary), seq_type=SequenceType.SEQUENCE)
    ]
@provider(init_hook=predict_initializer, should_shuffle=False)
@ -52,4 +49,4 @@ def process_predict(settings, file_name):
        for line in f:
            comment = line.strip().split()
            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_slot
+            yield {'word': word_slot}
--- a/demo/recommendation/common_utils.py
+++ b/demo/recommendation/common_utils.py
@ -17,13 +17,14 @@ from paddle.trainer.PyDataProvider2 import *
 def meta_to_header(meta, name):
    metas = meta[name]['__meta__']['raw_meta']
    for each_meta in metas:
        slot_name = each_meta.get('name', '%s_id' % name)
        if each_meta['type'] == 'id':
-            yield integer_value(each_meta['max'])
+            yield slot_name, integer_value(each_meta['max'])
        elif each_meta['type'] == 'embedding':
            is_seq = each_meta['seq'] == 'sequence'
-            yield integer_value(
+            yield slot_name, integer_value(
                len(each_meta['dict']),
                seq_type=SequenceType.SEQUENCE
                if is_seq else SequenceType.NO_SEQUENCE)
        elif each_meta['type'] == 'one_hot_dense':
-            yield dense_vector(len(each_meta['dict']))
+            yield slot_name, dense_vector(len(each_meta['dict']))
--- a/demo/recommendation/dataprovider.py
+++ b/demo/recommendation/dataprovider.py
@ -16,6 +16,14 @@ from paddle.trainer.PyDataProvider2 import *
 import common_utils  # parse
 def __list_to_map__(lst):
    """Build a dict from an iterable of (key, value) pairs.

    Later pairs overwrite earlier ones that share the same key.
    """
    return {key: value for key, value in lst}
 def hook(settings, meta, **kwargs):
    """
    Init hook is invoked before process data. It will set obj.slots and store
@ -34,12 +42,16 @@ def hook(settings, meta, **kwargs):
    #    second part is user features.
    #    final part is rating score.
    # header is a list of [USE_SEQ_OR_NOT?, SlotType]
-    headers = list(common_utils.meta_to_header(meta, 'movie'))
+    movie_headers = list(common_utils.meta_to_header(meta, 'movie'))
-    headers.extend(list(common_utils.meta_to_header(meta, 'user')))
+    settings.movie_names = [h[0] for h in movie_headers]
-    headers.append(dense_vector(1))  # Score
+    headers = movie_headers
    user_headers = list(common_utils.meta_to_header(meta, 'user'))
    settings.user_names = [h[0] for h in user_headers]
    headers.extend(user_headers)
    headers.append(("rating", dense_vector(1)))  # Score
    # slot types.
-    settings.input_types = headers
+    settings.input_types = __list_to_map__(headers)
    settings.meta = meta
@ -57,20 +69,20 @@ def process(settings, filename):
            movie_meta = settings.meta['movie'][movie_id]
            user_meta = settings.meta['user'][user_id]
-            outputs = [movie_id - 1]
+            outputs = [('movie_id', movie_id - 1)]
            # Then add movie features
-            for each_meta in movie_meta:
+            for i, each_meta in enumerate(movie_meta):
-                outputs.append(each_meta)
+                outputs.append((settings.movie_names[i + 1], each_meta))
            # Then add user id.
-            outputs.append(user_id - 1)
+            outputs.append(('user_id', user_id - 1))
            # Then add user features.
-            for each_meta in user_meta:
+            for i, each_meta in enumerate(user_meta):
-                outputs.append(each_meta)
+                outputs.append((settings.user_names[i + 1], each_meta))
            # Finally, add score
-            outputs.append([score])
+            outputs.append(('rating', [score]))
            # Return data to paddle
-            yield outputs
+            yield __list_to_map__(outputs)
--- a/demo/recommendation/prediction.py
+++ b/demo/recommendation/prediction.py
@ -34,8 +34,8 @@ if __name__ == '__main__':
    network.loadParameters(model_path)
    with open('./data/meta.bin', 'rb') as f:
        meta = pickle.load(f)
-        headers = list(meta_to_header(meta, 'movie'))
+        headers = [h[1] for h in meta_to_header(meta, 'movie')]
-        headers.extend(list(meta_to_header(meta, 'user')))
+        headers.extend([h[1] for h in meta_to_header(meta, 'user')])
        cvt = DataProviderConverter(headers)
        while True:
            movie_id = int(raw_input("Input movie_id: "))
--- a/demo/recommendation/preprocess.sh
+++ b/demo/recommendation/preprocess.sh
@ -14,6 +14,15 @@
 # limitations under the License.
 set -e
 UNAME_STR=`uname`
 if [[ ${UNAME_STR} == 'Linux' ]]; then
 	SHUF_PROG='shuf'
 else
 	SHUF_PROG='gshuf'
 fi
 cd "$(dirname "$0")"
 delimiter='::'
 dir=ml-1m
@ -25,7 +34,7 @@ python meta_generator.py $dir meta.bin --config=meta_config.json
 echo 'split train/test file'
 python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
 echo 'shuffle train file'
-shuf $dir/ratings.dat.train > ratings.dat.train
+${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train
 cp $dir/ratings.dat.test .
 echo "./data/ratings.dat.train" > train.list
 echo "./data/ratings.dat.test" > test.list
--- a/demo/semantic_role_labeling/.gitignore
+++ b/demo/semantic_role_labeling/.gitignore
@ -8,3 +8,7 @@ data/test.wsj.seq_pair
 data/test.wsj.words
 data/tgt.dict
 output
 data/emb
 data/targetDict.txt
 data/verbDict.txt
 data/wordDict.txt
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
--- a/Show More
+++ b/Show More