Merge pull request #1 from PaddlePaddle/develop

Update from the origin
8 years ago · 2a9d71a5fb
parent 0955977c25 e7ca8b27ee
commit 2a9d71a5fb
292 changed files with 11147 additions and 2954 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -4,6 +4,7 @@ cache:
    - $HOME/third_party
    - $HOME/.ccache
    - $HOME/.cache/pip
    - $HOME/Library/Caches/Homebrew
 sudo: required
 dist: trusty
 os:
@ -25,9 +26,9 @@ addons:
    packages:
      - gcc-4.8
      - g++-4.8
      - gfortran-4.8
      - git
      - build-essential
      - libatlas-base-dev
      - python
      - python-pip
      - python2.7-dev
@ -54,7 +55,9 @@ before_install:
    fi
  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
  - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  - pip install numpy wheel protobuf sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
+  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
  # protobuf version.
  - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
 script:
  - paddle/scripts/travis/main.sh
 notifications:
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@ -16,7 +16,8 @@
 set(CBLAS_FOUND OFF)
 ## Find MKL First.
-set(MKL_ROOT $ENV{MKLROOT} CACHE PATH "Folder contains MKL")
+set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
 set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
 find_path(MKL_INCLUDE_DIR mkl.h PATHS
  ${MKL_ROOT}/include)
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -29,12 +29,14 @@ INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
 ExternalProject_Add(
    glog
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS gflags
    GIT_REPOSITORY  "https://github.com/google/glog.git"
    PREFIX          ${GLOG_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DWITH_GFLAGS=OFF
+    CMAKE_ARGS      -DWITH_GFLAGS=ON
    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
    CMAKE_ARGS      -DBUILD_TESTING=OFF
 )
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@ -15,7 +15,6 @@
 INCLUDE(cblas)
 IF(NOT ${CBLAS_FOUND})
    MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.")
    INCLUDE(ExternalProject)
    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
@ -28,20 +27,40 @@ IF(NOT ${CBLAS_FOUND})
        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
    ENDIF(WIN32)
    IF(CMAKE_COMPILER_IS_GNUCC)
        ENABLE_LANGUAGE(Fortran)
        LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
    ENDIF(CMAKE_COMPILER_IS_GNUCC)
    IF(NOT CMAKE_Fortran_COMPILER)
        MESSAGE(FATAL_ERROR "To build lapack in libopenblas, "
                "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
    ENDIF(NOT CMAKE_Fortran_COMPILER)
    ExternalProject_Add(
        openblas
        ${EXTERNAL_PROJECT_LOG_ARGS}
-        URL                 "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz"
+        GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
        GIT_TAG             v0.2.19
        PREFIX              ${CBLAS_SOURCES_DIR}
        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
        BUILD_IN_SOURCE     1
-        CONFIGURE_COMMAND   ""
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib
-        BUILD_COMMAND       make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER}
+        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
        INSTALL_COMMAND     make install PREFIX=<INSTALL_DIR>
        UPDATE_COMMAND      ""
        CONFIGURE_COMMAND   ""
    )
    ExternalProject_Add_Step(
        openblas lapacke_install
        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h" "${CBLAS_INSTALL_DIR}/include/lapacke_mangling.h"
        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke.h" "${CBLAS_INSTALL_DIR}/include/lapacke.h"
        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_config.h" "${CBLAS_INSTALL_DIR}/include/lapacke_config.h"
        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_utils.h" "${CBLAS_INSTALL_DIR}/include/lapacke_utils.h"
        DEPENDEES install
    )
    LIST(APPEND external_project_dependencies openblas)
-ENDIF()
+ENDIF(NOT ${CBLAS_FOUND})
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@ -29,17 +29,12 @@ IF(WIN32)
        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
  SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
 ELSE(WIN32)
  IF(${HOST_SYSTEM} STREQUAL "centos")
    SET(LIB "lib64")
  ELSE()
    SET(LIB "lib")
  ENDIF()
  SET(PROTOBUF_LITE_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
  SET(PROTOBUF_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
  SET(PROTOBUF_PROTOC_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
  SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
 ENDIF(WIN32)
@ -58,6 +53,7 @@ ExternalProject_Add(
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    -DCMAKE_BUILD_TYPE=Release
    -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
    -DCMAKE_INSTALL_LIBDIR=lib
 )
 LIST(APPEND external_project_dependencies protobuf)
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@ -26,11 +26,12 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
    find_python_module(wheel REQUIRED)
    find_python_module(google.protobuf REQUIRED)
    FIND_PACKAGE(NumPy REQUIRED)
-    IF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
+    IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
        MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
-        "please use pip to upgrade protobuf.")
+        "please use pip to upgrade protobuf. pip install -U protobuf")
-    ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
+    ENDIF()
 ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
    MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
    ##################################### PYTHON ########################################
    SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
    SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@ -38,14 +38,6 @@ IF(NOT SWIG_FOUND)
        SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
        SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe  CACHE FILEPATH "SWIG Executable" FORCE)
    ELSE(WIN32)
        # From PCRE configure
        ExternalProject_Add(pcre
            ${EXTERNAL_PROJECT_LOG_ARGS}
            GIT_REPOSITORY https://github.com/svn2github/pcre.git
            PREFIX ${SWIG_SOURCES_DIR}/pcre
            CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SWIG_INSTALL_DIR}/pcre
        )
        # swig uses bison find it by cmake and pass it down
        FIND_PACKAGE(BISON)
@ -54,16 +46,11 @@ IF(NOT SWIG_FOUND)
            GIT_REPOSITORY      https://github.com/swig/swig.git
            GIT_TAG             rel-3.0.10
            PREFIX              ${SWIG_SOURCES_DIR}
-            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && ./autogen.sh
+            CONFIGURE_COMMAND   cd <SOURCE_DIR> && ./autogen.sh && ./configure
-            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig &&
+                                --prefix=${SWIG_INSTALL_DIR} --without-pcre
-            env "PCRE_LIBS=${SWIG_INSTALL_DIR}/pcre/lib/libpcre.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcrecpp.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcreposix.a"
+            BUILD_COMMAND       cd <SOURCE_DIR> && make
-            ./configure
+            INSTALL_COMMAND     cd <SOURCE_DIR> && make install
-                --prefix=${SWIG_INSTALL_DIR}
+            UPDATE_COMMAND      ""
                --with-pcre-prefix=${SWIG_INSTALL_DIR}/pcre
            BUILD_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && make
            INSTALL_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && make install
            UPDATE_COMMAND  ""
            DEPENDS pcre
        )
        SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION})
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@ -54,6 +54,7 @@ ExternalProject_Add(
    CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
    CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
    CMAKE_ARGS      -DWITH_TORCH=OFF
    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
    CMAKE_ARGS      -DBUILD_SHARED=ON
 )
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@ -12,6 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Detects the OS and sets appropriate variables.
 # CMAKE_SYSTEM_NAME only give us a coarse-grained name,
 # but the name like centos is necessary in some scenes
 # to distinguish system for customization.
 #
 # for instance, protobuf libs path is <install_dir>/lib64
 # on CentOS, but <install_dir>/lib on other systems.
 IF(WIN32)
    SET(HOST_SYSTEM "win32")
 ELSE(WIN32)
@ -21,6 +29,7 @@ ELSE(WIN32)
        SET(MACOS_VERSION ${VERSION})
        SET(HOST_SYSTEM "macosx")
    ELSE(APPLE)
        IF(EXISTS "/etc/issue")
            FILE(READ "/etc/issue" LINUX_ISSUE)
            IF(LINUX_ISSUE MATCHES "CentOS")
@ -29,8 +38,24 @@ ELSE(WIN32)
                SET(HOST_SYSTEM "debian")
            ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
                SET(HOST_SYSTEM "ubuntu")
            ELSEIF(LINUX_ISSUE MATCHES "Red Hat")
                SET(HOST_SYSTEM "redhat")
            ELSEIF(LINUX_ISSUE MATCHES "Fedora")
                SET(HOST_SYSTEM "fedora")
            ENDIF()
        ENDIF(EXISTS "/etc/issue")
        IF(EXISTS "/etc/redhat-release")
            FILE(READ "/etc/redhat-release" LINUX_ISSUE)
            IF(LINUX_ISSUE MATCHES "CentOS")
                SET(HOST_SYSTEM "centos")
            ENDIF()
        ENDIF(EXISTS "/etc/redhat-release")
        IF(NOT HOST_SYSTEM)
            SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
        ENDIF()
    ENDIF(APPLE)
 ENDIF(WIN32)
@ -47,7 +72,7 @@ SET(EXTERNAL_PROJECT_LOG_ARGS
    LOG_DOWNLOAD    0     # Wrap download in script to log output
    LOG_UPDATE      1     # Wrap update in script to log output
    LOG_CONFIGURE   1     # Wrap configure in script to log output
-    LOG_BUILD       1     # Wrap build in script to log output
+    LOG_BUILD       0     # Wrap build in script to log output
    LOG_TEST        1     # Wrap test in script to log output
-    LOG_INSTALL     1     # Wrap install in script to log output
+    LOG_INSTALL     0     # Wrap install in script to log output
 )
--- a/demo/image_classification/prediction.py
+++ b/demo/image_classification/prediction.py
@ -126,7 +126,7 @@ class ImageClassifier():
        # For oversampling, average predictions across crops.
        # If not, the shape of output[name]: (1, class_number),
        # the mean is also applicable.
-        return output[output_layer].mean(0)
+        return output[output_layer]['value'].mean(0)
    def predict(self, image=None, output_layer=None):
        assert isinstance(image, basestring)
--- a/demo/mnist/api_train.py
+++ b/demo/mnist/api_train.py
@ -6,33 +6,15 @@ passed to C++ side of Paddle.
 The user api could be simpler and carefully designed.
 """
 import py_paddle.swig_paddle as api
 from py_paddle import DataProviderConverter
 import paddle.trainer.PyDataProvider2 as dp
 import numpy as np
 import random
 from mnist_util import read_from_mnist
 from paddle.trainer_config_helpers import *
 def optimizer_config():
    settings(
        learning_rate=1e-4,
        learning_method=AdamOptimizer(),
        batch_size=1000,
        model_average=ModelAverage(average_window=0.5),
        regularization=L2Regularization(rate=0.5))
 import numpy as np
 import paddle.v2 as paddle_v2
 import py_paddle.swig_paddle as api
 from paddle.trainer_config_helpers import *
 from py_paddle import DataProviderConverter
-def network_config():
+from mnist_util import read_from_mnist
    imgs = data_layer(name='pixel', size=784)
    hidden1 = fc_layer(input=imgs, size=200)
    hidden2 = fc_layer(input=hidden1, size=200)
    inference = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
    cost = classification_cost(
        input=inference, label=data_layer(
            name='label', size=10))
    outputs(cost)
 def init_parameter(network):
@ -75,19 +57,35 @@ def input_order_converter(generator):
 def main():
    api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
-    # get enable_types for each optimizer.
+    optimizer = paddle_v2.optimizer.Adam(
-    # enable_types = [value, gradient, momentum, etc]
+        learning_rate=1e-4,
-    # For each optimizer(SGD, Adam), GradientMachine should enable different
+        batch_size=1000,
-    # buffers.
+        model_average=ModelAverage(average_window=0.5),
-    opt_config_proto = parse_optimizer_config(optimizer_config)
+        regularization=L2Regularization(rate=0.5))
-    opt_config = api.OptimizationConfig.createFromProto(opt_config_proto)
+
-    _temp_optimizer_ = api.ParameterOptimizer.create(opt_config)
+    # Create Local Updater. Local means not run in cluster.
-    enable_types = _temp_optimizer_.getParameterTypes()
+    # For a cluster training, here we can change to createRemoteUpdater
    # in future.
    updater = optimizer.create_local_updater()
    assert isinstance(updater, api.ParameterUpdater)
    # define network
    images = paddle_v2.layer.data(
        name='pixel', type=paddle_v2.data_type.dense_vector(784))
    label = paddle_v2.layer.data(
        name='label', type=paddle_v2.data_type.integer_value(10))
    hidden1 = paddle_v2.layer.fc(input=images, size=200)
    hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
    inference = paddle_v2.layer.fc(input=hidden2,
                                   size=10,
                                   act=paddle_v2.activation.Softmax())
    cost = paddle_v2.layer.classification_cost(input=inference, label=label)
    # Create Simple Gradient Machine.
-    model_config = parse_network_config(network_config)
+    model_config = paddle_v2.layer.parse_network(cost)
-    m = api.GradientMachine.createFromConfigProto(
+    m = api.GradientMachine.createFromConfigProto(model_config,
-        model_config, api.CREATE_MODE_NORMAL, enable_types)
+                                                  api.CREATE_MODE_NORMAL,
                                                  optimizer.enable_types())
    # This type check is not useful. Only enable type hint in IDE.
    # Such as PyCharm
@ -96,19 +94,12 @@ def main():
    # Initialize Parameter by numpy.
    init_parameter(network=m)
    # Create Local Updater. Local means not run in cluster.
    # For a cluster training, here we can change to createRemoteUpdater
    # in future.
    updater = api.ParameterUpdater.createLocalUpdater(opt_config)
    assert isinstance(updater, api.ParameterUpdater)
    # Initialize ParameterUpdater.
    updater.init(m)
    # DataProvider Converter is a utility convert Python Object to Paddle C++
    # Input. The input format is as same as Paddle's DataProvider.
-    converter = DataProviderConverter(
+    converter = DataProviderConverter(input_types=[images.type, label.type])
        input_types=[dp.dense_vector(784), dp.integer_value(10)])
    train_file = './data/raw_data/train'
    test_file = './data/raw_data/t10k'
--- a/demo/mnist/api_train_v2.py
+++ b/demo/mnist/api_train_v2.py
@ -0,0 +1,62 @@
 import paddle.v2 as paddle
 def main():
    paddle.init(use_gpu=False, trainer_count=1)
    # define network topology
    images = paddle.layer.data(
        name='pixel', type=paddle.data_type.dense_vector(784))
    label = paddle.layer.data(
        name='label', type=paddle.data_type.integer_value(10))
    hidden1 = paddle.layer.fc(input=images, size=200)
    hidden2 = paddle.layer.fc(input=hidden1, size=200)
    inference = paddle.layer.fc(input=hidden2,
                                size=10,
                                act=paddle.activation.Softmax())
    cost = paddle.layer.classification_cost(input=inference, label=label)
    parameters = paddle.parameters.create(cost)
    adam_optimizer = paddle.optimizer.Adam(learning_rate=0.01)
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=adam_optimizer)
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 1000 == 0:
                result = trainer.test(reader=paddle.reader.batched(
                    paddle.dataset.mnist.test(), batch_size=256))
                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    result.metrics)
        else:
            pass
    trainer.train(
        reader=paddle.reader.batched(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=8192),
            batch_size=32),
        event_handler=event_handler)
    # output is a softmax layer. It returns probabilities.
    # Shape should be (100, 10)
    probs = paddle.infer(
        output=inference,
        parameters=parameters,
        reader=paddle.reader.batched(
            paddle.reader.firstn(
                paddle.reader.map_readers(lambda item: (item[0], ),
                                          paddle.dataset.mnist.test()),
                n=100),
            batch_size=32))
    print probs.shape
 if __name__ == '__main__':
    main()
--- a/demo/model_zoo/resnet/classify.py
+++ b/demo/model_zoo/resnet/classify.py
@ -156,7 +156,7 @@ class ImageClassifier():
            # For oversampling, average predictions across crops.
            # If not, the shape of output[name]: (1, class_number),
            # the mean is also applicable.
-            res[name] = output[name].mean(0)
+            res[name] = output[name]['value'].mean(0)
        return res
--- a/demo/sentiment/dataprovider.py
+++ b/demo/sentiment/dataprovider.py
@ -32,4 +32,6 @@ def process(settings, file_name):
            word_slot = [
                settings.word_dict[w] for w in words if w in settings.word_dict
            ]
            if not word_slot:
                continue
            yield word_slot, label
--- a/demo/sentiment/predict.py
+++ b/demo/sentiment/predict.py
@ -138,7 +138,11 @@ def main():
    batch = []
    for line in sys.stdin:
-        batch.append([predict.get_index(line)])
+        words = predict.get_index(line)
        if words:
            batch.append([words])
        else:
            print('All the words in [%s] are not in the dictionary.' % line)
        if len(batch) == batch_size:
            predict.batch_predict(batch)
            batch = []
--- a/demo/traffic_prediction/predict.sh
+++ b/demo/traffic_prediction/predict.sh
@ -25,6 +25,6 @@ paddle train \
    --config_args=is_predict=1 \
    --predict_output_dir=. 
-python gen_result.py > result.txt
+python gen_result.py > result.csv
 rm -rf rank-00000
--- a/doc/api/trainer_config_helpers/layers.rst
+++ b/doc/api/trainer_config_helpers/layers.rst
@ -139,24 +139,12 @@ lstmemory
    :members: lstmemory
    :noindex:
 lstm_step_layer
 ---------------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: lstm_step_layer
    :noindex:
 grumemory
 ---------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: grumemory
    :noindex:
 gru_step_layer
 ---------------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: gru_step_layer
    :noindex:
 Recurrent Layer Group
 =====================
@ -172,6 +160,18 @@ recurrent_group
    :members: recurrent_group
    :noindex:
 lstm_step_layer
 ---------------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: lstm_step_layer
    :noindex:
 gru_step_layer
 ---------------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: gru_step_layer
    :noindex:
 beam_search
 ------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@ -279,6 +279,12 @@ concat_layer
    :members: concat_layer
    :noindex:
 seq_concat_layer
 ----------------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: seq_concat_layer
    :noindex:
 Reshaping Layers
 ================
@ -302,6 +308,18 @@ repeat_layer
    :members: repeat_layer
    :noindex:
 rotate_layer
 ------------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: rotate_layer
    :noindex:
 seq_reshape_layer
 -----------------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: seq_reshape_layer
    :noindex:
 Math Layers
 ===========
@ -382,6 +400,15 @@ sampling_id_layer
    :members: sampling_id_layer
    :noindex:
 Slicing and Joining Layers
 ==========================
 pad_layer
 -----------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: pad_layer
    :noindex:
 ..  _api_trainer_config_helpers_layers_cost_layers:
 Cost Layers
@ -441,6 +468,12 @@ ctc_layer
    :members: ctc_layer
    :noindex:
 warp_ctc_layer
 --------------
 ..  automodule:: paddle.trainer_config_helpers.layers
    :members: warp_ctc_layer
    :noindex:
 nce_layer
 -----------
 ..  automodule:: paddle.trainer_config_helpers.layers
--- a/doc/design/api.md
+++ b/doc/design/api.md
--- a/doc/design/reader/README.md
+++ b/doc/design/reader/README.md
@ -0,0 +1,161 @@
 # Python Data Reader Design Doc
 At training and testing time, PaddlePaddle programs need to read data. To make it easier for users to write data-reading code, we define that
 - A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
 - A *reader creator* is a function that returns a reader function.
 - A *reader decorator* is a function that accepts one or more readers and returns a reader.
 and provide frequently used reader creators and reader decorators.
 ## Data Reader Interface
 Indeed, a *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`):
 ```
 iterable = data_reader()
 ```
 Each element produced by the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item or a tuple of items. Each item should be of a [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int).
 An example implementation for single item data reader creator:
 ```python
 def reader_creator_random_image(width, height):
 	def reader():
 		while True:
 			yield numpy.random.uniform(-1, 1, size=width*height)
 	return reader
 ```
 An example implementation for multiple item data reader creator:
 ```python
 def reader_creator_random_image_and_label(width, height, label):
 	def reader():
 		while True:
 			yield numpy.random.uniform(-1, 1, size=width*height), label
 	return reader
 ```
 ## Usage
 The data reader, the mapping from the item(s) read to the data layers, the batch size, and the total number of passes are passed into `paddle.train`:
 ```python
 # two data layer is created:
 image_layer = paddle.layer.data("image", ...)
 label_layer = paddle.layer.data("label", ...)
 # ...
 paddle.train(paddle.dataset.mnist, {"image":0, "label":1}, 128, 10, ...)
 ```
 ## Data Reader Decorator
 A *data reader decorator* takes one or more data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use the `@` syntax.
 Because we have a strict interface for data readers (no parameters, returns a single data item), data readers can be used flexibly via data reader decorators. The following are a few examples:
 ### Prefetch Data
 Since reading data may take time and training cannot proceed without data, it is generally a good idea to prefetch data.
 Use `paddle.reader.buffered` to prefetch data:
 ```python
 buffered_reader = paddle.reader.buffered(paddle.dataset.mnist, 100)
 ```
 `buffered_reader` will try to buffer (prefetch) `100` data entries.
 ### Compose Multiple Data Readers
 For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
 We can do:
 ```python
 def reader_creator_random_image(width, height):
 	def reader():
 		while True:
 			yield numpy.random.uniform(-1, 1, size=width*height)
 	return reader
 def reader_creator_bool(t):
 	def reader():
 		while True:
 			yield t
 	return reader
 true_reader = reader_creator_bool(True)
 false_reader = reader_creator_bool(False)
 reader = paddle.reader.compose(paddle.dataset.mnist, reader_creator_random_image(20, 20), true_reader, false_reader)
 # Skipped 1 because paddle.dataset.mnist produces two items per data entry.
 # And we don't care second item at this time.
 paddle.train(reader, {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
 ```
 ### Shuffle
 Given a shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.
 Example:
 ```python
 reader = paddle.reader.shuffle(paddle.dataset.mnist, 512)
 ```
 ## Q & A
 ### Why return only a single entry, but not a mini batch?
 If a mini batch is returned, the data reader needs to take care of the batch size. But batch size is a concept for training; it makes more sense for the user to specify the batch size as a parameter for `train`.
 Practically, always returning a single entry makes reusing existing data readers much easier (e.g., if an existing reader returns not a single entry but 3 entries, the training code will be more complex because it needs to handle cases like batch size 2).
 ### Why use a dictionary but not a list to provide mapping?
 We decided to use a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) because the user can easily reuse an item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip an item (e.g., using `{"image_a":0, "label":2}`).
 ### How to create custom data reader creator
 ```python
 def image_reader_creator(image_path, label_path, n):
 	def reader():
 		f = open(image_path)
 		l = open(label_path)
 		images = numpy.fromfile(
 			f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
 		images = images / 255.0 * 2.0 - 1.0
 		labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
 		for i in xrange(n):
 			yield images[i, :], labels[i] # a single entry of data is created each time
 		f.close()
 		l.close()
 	return reader
 # images_reader_creator creates a reader
 reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
 paddle.train(reader, {"image":0, "label":1}, ...)
 ```
 ### How is `paddle.train` implemented
 An example implementation of paddle.train could be:
 ```python
 def make_minibatch(reader, minibatch_size):
 	def ret():
 		r = reader()
 		buf = [r.next() for x in xrange(minibatch_size)]
 		while len(buf) > 0:
 			yield buf
 			buf = [r.next() for x in xrange(minibatch_size)]
 	return ret
 def train(reader, mapping, batch_size, total_pass):
 	for pass_idx in range(total_pass):
 		for mini_batch in make_minibatch(reader): # this loop will never end in online learning.
 			do_forward_backward(mini_batch, mapping)
 ```
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@ -4,6 +4,8 @@ Installing from Sources
 * [1. Download and Setup](#download)
 * [2. Requirements](#requirements)
 * [3. Build on Ubuntu](#ubuntu)
 * [4. Build on Centos](#centos)
 ## <span id="download">Download and Setup</span> 
 You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
@ -16,9 +18,10 @@ cd paddle
 To compile the source code, your computer must be equipped with the following dependencies.
- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
+- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
- **CMake**: version >= 3.0 (at least CMake 3.4 on Mac OS X)
+- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
 - **Python**: only support Python 2.7
 **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
 For CUDA 8.0, GCC versions later than 5.3 are not supported!
@ -64,7 +67,8 @@ As a simple example, consider the following:
 1. **BLAS Dependencies(optional)**
-    Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
+    CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically.
    To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
    ```bash
    # specify MKL
@ -94,12 +98,78 @@ As a simple example, consider the following:
 ### Install Dependencies
- **CPU Dependencies**
+- **Paddle Dependencies**
    ```bash
    # necessary
    sudo apt-get update
-    sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git
+    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
    sudo pip install 'protobuf==3.1.0.post1'
    # install cmake 3.4
    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
        cd .. && rm -rf cmake-3.4.1
    ```
 - **GPU Dependencies (optional)**
    To build GPU version, you will need the following installed:
        1. a CUDA-capable GPU
        2. A supported version of Linux with a gcc compiler and toolchain
        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
    The CUDA development environment relies on tight integration with the host development environment,
    including the host compiler and C runtime libraries, and is therefore only supported on
    distribution versions that have been qualified for this CUDA Toolkit release.
    After downloading cuDNN library, issue the following commands:
    ```bash
    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
    ```
    Then you need to set the LD\_LIBRARY\_PATH and PATH environment variables in ~/.bashrc.
    ```bash
    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
    export PATH=/usr/local/cuda/bin:$PATH
    ```
 ### Build and Install
 As usual, the best option is to create a build folder under the paddle project directory.
 ```bash
 mkdir build && cd build
 ``` 
 Finally, you can build and install PaddlePaddle:
 ```bash
 # you can add build option here, such as:    
 cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=<path to install>/bin:$PATH
 # install PaddlePaddle Python modules.
 sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
 ```
 ## <span id="centos">Build on Centos 7</span>
 ### Install Dependencies
 - **CPU Dependencies**
    ```bash
    # necessary
    sudo yum update
    sudo yum install -y epel-release
    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
    sudo pip install wheel numpy
    sudo pip install 'protobuf>=3.0.0'
    ```
@ -142,7 +212,7 @@ Finally, you can build and install PaddlePaddle:
 ```bash
 # you can add build option here, such as:    
-cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
+cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@ -12,7 +12,7 @@ PaddlePaddle项目提供官方 `Docker <https://www.docker.com/>`_ 镜像。Dock
 PaddlePaddle提供的Docker镜像版本
 --------------------------------
-我们提供了12个 `Docker image <https://hub.docker.com/r/paddledev/paddle/tags/>`_ ，他们的image name都是 :code:`paddle-dev/paddle` ，tag分别为
+我们提供了12个 `Docker image <https://hub.docker.com/r/paddledev/paddle/tags/>`_ ，他们的image name都是 :code:`paddledev/paddle` ，tag分别为
 +-----------------+------------------+------------------------+-----------------------+
 |                 |   normal         |           devel        |          demo         |
@ -45,7 +45,7 @@ PaddlePaddle提供的Docker镜像版本
    if cat /proc/cpuinfo | grep -q avx ; then echo "Support AVX"; else echo "Not support AVX"; fi
-如果输出 :code:`Support AVX`，则可以选择上表中的AVX版本PaddlePaddle。否则需要选择非AVX的PaddlePaddle。选择普通CPU版本的devel版本的image，则可以使用 :code:`paddle-dev/paddle:cpu-devel-latest` 来引用这个image。
+如果输出 :code:`Support AVX`，则可以选择上表中的AVX版本PaddlePaddle。否则需要选择非AVX的PaddlePaddle。选择普通CPU版本的devel版本的image，则可以使用 :code:`paddledev/paddle:cpu-devel-latest` 来引用这个image。
 PaddlePaddle提供的镜像并不包含任何命令运行，想要运行PaddlePaddle，您需要进入镜像运行PaddlePaddle
 程序或者自定义一个含有启动脚本的image。具体请参考注意事项中的 :code:`使用ssh访问PaddlePaddle镜像`
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@ -16,70 +16,71 @@ Developers can work on PaddlePaddle using Docker.  This allows
 developers to work on different platforms -- Linux, Mac OS X, and
 Windows -- in a consistent way.
-The general development workflow with Docker and CMake is as follows:
+1. Build the Development Environment as a Docker Image
 1. Get the source code of Paddle:
   .. code-block:: bash
-      git clone https://github.com/PaddlePaddle/Paddle.git
+      git clone --recursive https://github.com/PaddlePaddle/Paddle
      cd Paddle
      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
-2. Build a development Docker image :code:`paddle:dev` from the source
+   Note that by default :code:`docker build` wouldn't import source
-   code.  This image contains all the development tools and
+   tree into the image and build it.  If we want to do that, we need
-   dependencies of PaddlePaddle.
+   to set a build arg:
   .. code-block:: bash
-      cd paddle
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+
 2. Run the Development Environment
-   Sometimes docker build might suffer from a slow network connection to the official Ubuntu apt-source servers. In such case, we can specify an apt-source mirror server that is geologically nearer to us. In the following example, we specified an apt-source server that responds fast in China.You can specify the UBUNTU MIRROR with :code:`--build-arg UBUNTU_MIRROR` like the example below.
+   Once we got the image :code:`paddle:dev`, we can use it to develop
   Paddle by mounting the local source code tree into a container that
   runs the image:
   .. code-block:: bash
-      docker build \
+      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev
-       --build-arg UBUNTU_MIRROR="http://mirrors.163.com" \
+
-       -t paddle:dev \
+   This runs a container of the development environment Docker image
-       -f paddle/scripts/docker/Dockerfile .
+   with the local source tree mounted to :code:`/paddle` of the
   container.
   Note that the default entry-point of :code:`paddle:dev` is
   :code:`sshd`, and the above :code:`docker run` command actually starts
   an SSHD server listening on port 2202.  This allows us to log into
   this container with:
   .. code-block:: bash
      ssh root@localhost -p 2202
-3. Run the image as a container and mounting local source code
+   Usually, I run above commands on my Mac.  I can also run them on a
-   directory into the container.  This allows us to change the code on
+   GPU server :code:`xxx.yyy.zzz.www` and ssh from my Mac to it:
   the host and build it within the container.
   .. code-block:: bash
-      docker run       \
+      my-mac$ ssh root@xxx.yyy.zzz.www -p 2202
       -d              \
       --name paddle   \
       -p 2022:22      \
       -v $PWD:/paddle \
       paddle:dev
-   where :code:`-d` makes the container running in background,
+3. Build and Install Using the Development Environment
   :code:`--name paddle` allows us to run a nginx container to serve
   documents in this container, :code:`-p 2022:22` allows us to SSH
   into this container, :code:`-v $PWD:/paddle` shares the source code
   on the host with the container.
-4. SSH into the container:
+   Once I am in the container, I can use
   :code:`paddle/scripts/docker/build.sh` to build, install, and test
   Paddle:
   .. code-block:: bash
-      ssh root@localhost -p 2022
+      /paddle/paddle/scripts/docker/build.sh
-5. We can edit the source code in the container or on this host.  Then
+   This builds everything about Paddle in :code:`/paddle/build`.  And
-   we can build using cmake
+   we can run unit tests there:
   .. code-block:: bash
-      cd /paddle # where paddle source code has been mounted into the container
+      cd /paddle/build
-      mkdir -p build
+      ctest
      cd build
      cmake -DWITH_TESTING=ON ..
      make -j `nproc`
      CTEST_OUTPUT_ON_FAILURE=1 ctest
 CPU-only and GPU Images
--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@ -32,7 +32,7 @@ pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers
 - `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
  - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
  - 输入：一个双层序列，或一个单层序列
@ -54,7 +54,7 @@ last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_
        last = last_seq(input=layer,
                        agg_level=AggregateLevel.EACH_SEQUENCE)
- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
  - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
  - 输入：一个双层序列或一个单层序列
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@ -10,6 +10,7 @@
  usage/cmd_parameter/index_cn.rst
  usage/concepts/use_concepts_cn.rst
  usage/cluster/cluster_train_cn.md
  usage/k8s/k8s_basis_cn.md
  usage/k8s/k8s_cn.md
  usage/k8s/k8s_distributed_cn.md
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@ -6,7 +6,7 @@
 在本文中，我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。
-在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s) ）的用户参考。
+在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) ）的用户参考。
 ## 前提条件
--- a/Show More
+++ b/Show More