Merge pull request #484 from PaddlePaddle/release/v0.9.0

Release v0.9.0
master v0.9.0
Yu Yang 9 years ago committed by GitHub
commit 4c2b3b6ea3

@ -13,8 +13,6 @@
# The document of clang-format is # The document of clang-format is
# http://clang.llvm.org/docs/ClangFormat.html # http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html # http://clang.llvm.org/docs/ClangFormatStyleOptions.html
#
# TODO(yuyang18): Add python and other language code style
--- ---
Language: Cpp Language: Cpp
BasedOnStyle: Google BasedOnStyle: Google
@ -22,8 +20,9 @@ IndentWidth: 2
TabWidth: 2 TabWidth: 2
ContinuationIndentWidth: 4 ContinuationIndentWidth: 4
AccessModifierOffset: -2 # The private/protected/public has no indent in class AccessModifierOffset: -2 # The private/protected/public has no indent in class
PointerAlignment: Left # int* p/int& p, not int *p/int &p
Standard: Cpp11 Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
... ...

.gitignore

@ -4,3 +4,7 @@ build/
.vscode .vscode
.idea .idea
.project
.cproject
.pydevproject
Makefile

@ -0,0 +1,24 @@
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
sha: c25201a00e6b0514370501050cf2a8538ac12270
hooks:
- id: remove-crlf
- repo: https://github.com/reyoung/mirrors-yapf.git
sha: v0.13.2
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
- id: end-of-file-fixer
# TODO(yuyang): trailing-whitespace currently has some bugs on markdown
# files, so please do not add it to the pre-commit hooks for now
# - id: trailing-whitespace
#
# TODO(yuyang): debug-statements does not fit Paddle, because
# not all of our Python code is runnable; some is only used for
# documentation
# - id: debug-statements

@ -0,0 +1,3 @@
[style]
based_on_style = pep8
column_limit = 80

@ -2,9 +2,17 @@ language: cpp
cache: ccache cache: ccache
sudo: required sudo: required
dist: trusty dist: trusty
os:
- linux
- osx
env: env:
- JOB=DOCS - JOB=DOCS
- JOB=BUILD_AND_TEST - JOB=BUILD_AND_TEST
matrix:
exclude:
- os: osx
env: JOB=DOCS # Only generate documentation in linux
addons: addons:
apt: apt:
packages: packages:
@ -27,9 +35,22 @@ addons:
- libgoogle-glog-dev - libgoogle-glog-dev
- libgflags-dev - libgflags-dev
- libgtest-dev - libgtest-dev
- curl
- lcov
- graphviz
- swig
before_install: before_install:
- pip install wheel protobuf sphinx breathe recommonmark - |
- sudo paddle/scripts/travis/before_install.sh if [ ${JOB} == "BUILD_AND_TEST" ]; then
if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
then
echo "Only markdown docs were updated, stopping build process."
exit
fi
fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
script: script:
- paddle/scripts/travis/main.sh - paddle/scripts/travis/main.sh
notifications: notifications:

@ -2,14 +2,14 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C) project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0) set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 8) set(PADDLE_MINOR_VERSION 9)
set(PADDLE_PATCH_VERSION 0b1) set(PADDLE_PATCH_VERSION 0)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR}) set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
include(package) include(package)
include(swig) find_package(SWIG 2.0)
find_package(CUDA QUIET) find_package(CUDA QUIET)
find_package(Protobuf REQUIRED) find_package(Protobuf REQUIRED)
find_package(PythonLibs 2.7 REQUIRED) find_package(PythonLibs 2.7 REQUIRED)
@ -40,6 +40,9 @@ option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND}) option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
option(ON_TRAVIS "Running test on travis-ci or not." OFF) option(ON_TRAVIS "Running test on travis-ci or not." OFF)
option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
@ -49,11 +52,16 @@ endif()
include(enableCXX11) include(enableCXX11)
include(cpplint) include(cpplint)
include(ccache) include(ccache)
if(WITH_RDMA)
include(rdma)
endif()
include(util) include(util)
include(flags) include(flags)
include(cudnn) include(cudnn)
include(FindPythonModule) include(FindPythonModule)
include(check_packages) include(check_packages)
include(swig)
include(coveralls)
# add PaddlePaddle version # add PaddlePaddle version
if(DEFINED ENV{PADDLE_VERSION}) if(DEFINED ENV{PADDLE_VERSION})
@ -65,6 +73,19 @@ else()
Subversion_WC_INFO(${PROJ_ROOT} Project) Subversion_WC_INFO(${PROJ_ROOT} Project)
add_definitions(-DPADDLE_VERSION=${Project_WC_REVISION}) add_definitions(-DPADDLE_VERSION=${Project_WC_REVISION})
endif() endif()
elseif(EXISTS ${PROJ_ROOT}/.git/)
find_package(Git REQUIRED)
execute_process(
COMMAND ${GIT_EXECUTABLE} log -1 --format=%H
WORKING_DIRECTORY ${PROJ_ROOT}
OUTPUT_VARIABLE GIT_SHA1
RESULT_VARIABLE GIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ${GIT_RESULT})
add_definitions(-DPADDLE_VERSION=\"${GIT_SHA1}\")
else()
message(WARNING "Cannot add paddle version from git tag")
endif()
endif() endif()
endif() endif()
@ -74,11 +95,24 @@ if(NOT WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC) add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else() else()
if(${CUDA_VERSION_MAJOR} GREATER 6)
if(COMPILER_SUPPORT_CXX11)
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
endif()
endif()
# TODO(yuyang18): Change it to remove std=c++11 in cuda compile. # TODO(yuyang18): Change it to remove std=c++11 in cuda compile.
set(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_PROPAGATE_HOST_FLAGS OFF)
if(NOT CUDNN_FOUND) if(NOT CUDNN_FOUND)
message(FATAL_ERROR "Paddle need cudnn to compile") message(FATAL_ERROR "Paddle need cudnn to compile")
endif() endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -O3 --use_fast_math")
if(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
else(WITH_AVX)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
endif(WITH_AVX)
if(WITH_DSO) if(WITH_DSO)
set(CUDA_LIBRARIES "") set(CUDA_LIBRARIES "")
@ -91,7 +125,7 @@ else()
endif(NOT WITH_GPU) endif(NOT WITH_GPU)
if(WITH_DOUBLE) if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE -DHPPL_TYPE_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE)
set(ACCURACY double) set(ACCURACY double)
else(WITH_DOUBLE) else(WITH_DOUBLE)
set(ACCURACY float) set(ACCURACY float)
@ -102,11 +136,11 @@ if(NOT WITH_TIMER)
endif(NOT WITH_TIMER) endif(NOT WITH_TIMER)
if(WITH_AVX) if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
else(WITH_AVX) else(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
endif(WITH_AVX) endif(WITH_AVX)
if(WITH_PYTHON) if(WITH_PYTHON)
@ -116,12 +150,15 @@ else(WITH_PYTHON)
add_definitions(-DPADDLE_NO_PYTHON) add_definitions(-DPADDLE_NO_PYTHON)
endif(WITH_PYTHON) endif(WITH_PYTHON)
if(NOT WITH_RDMA) if(WITH_RDMA)
include_directories("${RDMA_INC_DIR}")
else(WITH_RDMA)
add_definitions(-DPADDLE_DISABLE_RDMA) add_definitions(-DPADDLE_DISABLE_RDMA)
endif() endif(WITH_RDMA)
if(WITH_GLOG) if(WITH_GLOG)
add_definitions(-DPADDLE_USE_GLOG) add_definitions(-DPADDLE_USE_GLOG)
include_directories(${LIBGLOG_INCLUDE_DIR})
endif() endif()
if(WITH_GFLAGS) if(WITH_GFLAGS)

@ -0,0 +1,14 @@
Thank you for contributing to PaddlePaddle. Submitting an issue is a great help for us.
Both Chinese and English issues are welcome.
It's hard to solve a problem when important details are missing.
Before submitting the issue, please check your request against the following criteria.
- [ ] Was a similar issue submitted or resolved before? You can search the existing issues on GitHub.
- [ ] Have you searched for your issue on common search engines?
- [ ] Is my description of the issue clear enough to reproduce this problem?
  * If an error occurred, we need details such as `how do you run your code?`, `what system do you use?`, `are you using GPU or not?`, etc.
  * If you use an [asciinema](https://asciinema.org/) recording to show what you are doing when the problem happens, that's awesome! We can help you solve the problem more quickly.
- [ ] Does my description of the issue use GitHub Markdown correctly?
  * Please use proper Markdown syntax for styling all forms of writing, e.g., source code, error messages, etc.
  * Check out [this page](https://guides.github.com/features/mastering-markdown/) to learn more about Markdown.

@ -1,17 +1,21 @@
# PaddlePaddle # PaddlePaddle
[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
Welcome to the PaddlePaddle GitHub.
The software will be released on Sept. 30 with full documentation and installation support. [![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
[![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
A pre-release version is available now for those who are eager to take a look. Welcome to the PaddlePaddle GitHub.
PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use, PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use,
efficient, flexible and scalable deep learning platform, which is originally efficient, flexible and scalable deep learning platform, which is originally
developed by Baidu scientists and engineers for the purpose of applying deep developed by Baidu scientists and engineers for the purpose of applying deep
learning to many products at Baidu. learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle.
## Features ## Features
- **Flexibility** - **Flexibility**
@ -27,11 +31,12 @@ learning to many products at Baidu.
optimization occurs at different levels of PaddlePaddle, including optimization occurs at different levels of PaddlePaddle, including
computing, memory, architecture and communication. The following are some computing, memory, architecture and communication. The following are some
examples: examples:
1. Optimized math operations through SSE/AVX intrinsics, BLAS libraries
- Optimized math operations through SSE/AVX intrinsics, BLAS libraries
(e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels. (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
2. Highly optimized recurrent networks which can handle **variable-length** - Highly optimized recurrent networks which can handle **variable-length**
sequence without padding. sequence without padding.
3. Optimized local and distributed training for models with high dimensional - Optimized local and distributed training for models with high dimensional
sparse data. sparse data.
- **Scalability** - **Scalability**
@ -51,10 +56,12 @@ learning to many products at Baidu.
the capability of PaddlePaddle to make a huge impact for your product. the capability of PaddlePaddle to make a huge impact for your product.
## Installation ## Installation
See [Installation Guide](http://paddlepaddle.org/doc/build/) to install from pre-built package or build from the source code. (Note: The installation packages are still in pre-release state and your experience of installation may not be smooth.). Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from
pre-built packages (**docker image**, **deb package**) or
directly build on **Linux** and **Mac OS X** from the source code.
## Documentation ## Documentation
- [Chinese Documentation](http://paddlepaddle.org/doc_cn/) <br> Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers.
- [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en) <br> - [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en) <br>
You can follow the quick start tutorial to learn how use PaddlePaddle You can follow the quick start tutorial to learn how use PaddlePaddle
@ -81,9 +88,9 @@ See [Installation Guide](http://paddlepaddle.org/doc/build/) to install from pre
- [Source Code Documents](http://paddlepaddle.org/doc/source/) <br> - [Source Code Documents](http://paddlepaddle.org/doc/source/) <br>
## Ask Questions ## Ask Questions
Please join the [**gitter chat**](https://gitter.im/PaddlePaddle/Deep_Learning) or send email to
If you want to ask questions and discuss about methods and models, welcome **paddle-dev@baidu.com** to ask questions and talk about methods and models.
to send email to paddle-dev@baidu.com. Framework development discussions and Framework development discussions and
bug reports are collected on [Issues](https://github.com/baidu/paddle/issues). bug reports are collected on [Issues](https://github.com/baidu/paddle/issues).
## Copyright and License ## Copyright and License

@ -0,0 +1,69 @@
# Release v0.9.0
## New Features:
* New Layers
  * bilinear interpolation layer.
  * spatial pyramid-pool layer.
  * de-convolution layer.
  * maxout layer.
* Support rectangular padding, stride, window and input for the pooling operation.
* Add `--job=time` to the trainer, which prints timing information without the compile option `WITH_TIMER=ON`.
* Expose cost_weight/nce_layer in `trainer_config_helpers`.
* Add FAQ, concepts, and h-rnn docs.
* Add Bidi-LSTM and DB-LSTM to the quick start demo. @alvations
* Add usage tracking scripts.
## Improvements
* Add Travis-CI for Mac OS X. Enable the SWIG unit tests in Travis-CI. Skip Travis-CI when only docs are changed.
* Add code coverage tools.
* Refine the convolution layer to speed it up and reduce GPU memory usage.
* Speed up PyDataProvider2.
* Add Ubuntu deb package build scripts.
* Make Paddle use the git-flow branching model.
* PServer now supports running with no parameter blocks.
## Bug Fixes
* Add the zlib link to py_paddle.
* Add a runtime check of sparse input data for sparse layers.
* Fix a bug in sparse matrix multiplication.
* Fix a floating-point overflow problem in tanh.
* Fix some nvcc compile options.
* Fix a bug in the dictionary yield format of DataProvider.
* Fix SRL hanging on exit.
# Release v0.8.0beta.1
New features:
* Mac OS X is supported by building from source code. #138
* Both GPU and CPU versions of PaddlePaddle are supported.
* Support CUDA 8.0.
* Enhance `PyDataProvider2`
  * Add a dictionary yield format: `PyDataProvider2` can yield a dictionary whose keys are `data_layer` names and whose values are the corresponding features (see the sketch after this list).
  * Add `min_pool_size` to control the memory pool in the provider.
* Add a `deb` install package & Docker image for no_avx machines.
  * Especially for cloud computing and virtual machines.
* Automatically disable AVX instructions in CMake when the machine's CPU does not support them.
* Add the Parallel NN API in trainer_config_helpers.
* Add Travis-CI for GitHub.
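A minimal sketch of the dictionary yield format and `min_pool_size` mentioned above. The layer names (`image`, `label`), feature sizes, and the line format parsed here are assumptions for illustration, not the actual demo code:

```python
# A hypothetical data provider illustrating the dictionary yield format and
# min_pool_size. Layer names ("image", "label"), sizes, and the parsed line
# format are assumptions.
from paddle.trainer.PyDataProvider2 import provider, dense_vector, integer_value


def parse_line(line):
    # Assumed line format: "<label> <f1> <f2> ... <f784>".
    parts = line.split()
    return [float(x) for x in parts[1:]], int(parts[0])


@provider(
    input_types={
        'image': dense_vector(28 * 28),  # keys must match data_layer names
        'label': integer_value(10),
    },
    min_pool_size=1024)  # controls the provider's in-memory sample pool
def process(settings, file_name):
    with open(file_name) as f:
        for line in f:
            features, label = parse_line(line)
            # Yield a dictionary: key is the data_layer's name, value is its feature.
            yield {'image': features, 'label': label}
```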
Bug fixes:
* Fix several bugs in trainer_config_helpers, and complete the unit tests for trainer_config_helpers.
* Check whether PaddlePaddle is installed before running unit tests.
* Fix bugs on GTX-series GPUs.
* Fix a bug in MultinomialSampler.
More documentation has also been written since the last release.
# Release v0.8.0beta.0
PaddlePaddle v0.8.0beta.0 release. The install package is not yet stable; this is a pre-release version.

@ -3,36 +3,55 @@
INCLUDE(CheckCXXSourceRuns) INCLUDE(CheckCXXSourceRuns)
SET(FIND_AVX_10)
SET(FIND_AVX_20)
SET(AVX_FLAGS)
SET(AVX_FOUND)
# Check AVX 2
SET(CMAKE_REQUIRED_FLAGS)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(CMAKE_REQUIRED_FLAGS "-mavx2") set(MMX_FLAG "-mmmx")
ELSEIF(MSVC AND NOT CMAKE_CL_64) # reserve for WINDOWS set(SSE2_FLAG "-msse2")
SET(CMAKE_REQUIRED_FLAGS "/arch:AVX2") set(SSE3_FLAG "-msse3")
SET(AVX_FLAG "-mavx")
SET(AVX2_FLAG "-mavx2")
ELSEIF(MSVC)
set(MMX_FLAG "/arch:MMX")
set(SSE2_FLAG "/arch:SSE2")
set(SSE3_FLAG "/arch:SSE3")
SET(AVX_FLAG "/arch:AVX")
SET(AVX2_FLAG "/arch:AVX2")
ENDIF() ENDIF()
# Check MMX
set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
CHECK_CXX_SOURCE_RUNS(" CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h> #include <mmintrin.h>
int main() int main()
{ {
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); _mm_setzero_si64();
__m256i result = _mm256_abs_epi32 (a);
return 0; return 0;
}" FIND_AVX_20) }" MMX_FOUND)
# Check AVX # Check SSE2
SET(CMAKE_REQUIRED_FLAGS) set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") CHECK_CXX_SOURCE_RUNS("
SET(CMAKE_REQUIRED_FLAGS "-mavx") #include <emmintrin.h>
ELSEIF(MSVC AND NOT CMAKE_CL_64) int main()
SET(CMAKE_REQUIRED_FLAGS "/arch:AVX") {
endif() _mm_setzero_si128();
return 0;
}" SSE2_FOUND)
# Check SSE3
set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
CHECK_CXX_SOURCE_RUNS("
#include <pmmintrin.h>
int main()
{
__m128d a = _mm_set1_pd(6.28);
__m128d b = _mm_set1_pd(3.14);
__m128d result = _mm_addsub_pd(a, b);
result = _mm_movedup_pd(result);
return 0;
}" SSE3_FOUND)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
CHECK_CXX_SOURCE_RUNS(" CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h> #include <immintrin.h>
int main() int main()
@ -41,25 +60,17 @@ int main()
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b); __m256 result = _mm256_add_ps (a, b);
return 0; return 0;
}" FIND_AVX_10) }" AVX_FOUND)
IF(${FIND_AVX_20}) # Check AVX 2
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
SET(AVX_FLAGS "${AVX_FLAGS} -mavx2") CHECK_CXX_SOURCE_RUNS("
ELSEIF(MSVC) #include <immintrin.h>
SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX2") int main()
ENDIF() {
ENDIF() __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
IF(${FIND_AVX_10}) return 0;
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") }" AVX2_FOUND)
SET(AVX_FLAGS "${AVX_FLAGS} -mavx")
ELSEIF(MSVC)
SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
ENDIF()
ENDIF()
IF("${FIND_AVX_10}" OR "${FIND_AVX_20}") mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
SET(AVX_FOUND TRUE)
MESSAGE(STATUS "Find CPU supports ${AVX_FLAGS}.")
ENDIF()

@ -1,4 +1,4 @@
# Find the CBlas libraries # Find the CBlas and lapack libraries
# #
# It will search MKL, atlas, OpenBlas, reference-cblas in order. # It will search MKL, atlas, OpenBlas, reference-cblas in order.
# #
@ -17,10 +17,19 @@
## Find MKL First. ## Find MKL First.
set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL") set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include) find_path(MKL_INCLUDE_DIR mkl.h PATHS
find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib) ${MKL_ROOT}/include)
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS ${MKL_ROOT}/lib) find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib) ${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_core PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
@ -30,6 +39,7 @@ if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
${MKL_SEQUENTIAL_LIB} ${MKL_SEQUENTIAL_LIB}
${MKL_CORE_LIB}) ${MKL_CORE_LIB})
add_definitions(-DPADDLE_USE_MKL) add_definitions(-DPADDLE_USE_MKL)
message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return() # return file. return() # return file.
endif() endif()
@ -48,15 +58,19 @@ set(ATLAS_LIB_SEARCH_PATHS
) )
find_path(ATLAS_INC_DIR NAMES cblas.h find_path(ATLAS_INC_DIR NAMES cblas.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS}) PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS}) PATHS ${ATLAS_LIB_SEARCH_PATHS})
find_library(ATLAS_LIB NAMES atlas libatlas.so.3 find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS}) PATHS ${ATLAS_LIB_SEARCH_PATHS})
if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB) if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
set(CBLAS_PROVIDER ATLAS) set(CBLAS_PROVIDER ATLAS)
set(CBLAS_INC_DIR ${ATLAS_INC_DIR}) set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB}) set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
add_definitions(-DPADDLE_USE_ATLAS)
message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return() return()
endif() endif()
@ -76,6 +90,8 @@ set(OPENBLAS_LIB_SEARCH_PATHS
find_path(OPENBLAS_INC_DIR NAMES cblas.h find_path(OPENBLAS_INC_DIR NAMES cblas.h
PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
find_library(OPENBLAS_LIB NAMES openblas find_library(OPENBLAS_LIB NAMES openblas
PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
@ -83,6 +99,7 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
set(CBLAS_PROVIDER OPENBLAS) set(CBLAS_PROVIDER OPENBLAS)
set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR}) set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
set(CBLAS_LIBS ${OPENBLAS_LIB}) set(CBLAS_LIBS ${OPENBLAS_LIB})
message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return() return()
endif() endif()

@ -0,0 +1,103 @@
# CMake script for code coverage.
# If _COVERALLS_UPLOAD is ON, it will upload the JSON files to coveralls.io automatically.
# Param _COVERAGE_SRCS A list of coverage source files.
# Param _COVERALLS_UPLOAD Upload the result to coveralls.
# Param _CMAKE_SCRIPT_PATH CMake script path.
function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
# clean previous gcov data.
file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
# find curl for uploading the JSON later.
if (_COVERALLS_UPLOAD)
find_program(CURL_EXECUTABLE curl)
if (NOT CURL_EXECUTABLE)
message(FATAL_ERROR "Coveralls: curl not found!")
endif()
endif()
# When passing a CMake list to an external process, the list
# will be converted from the format "1;2;3" to "1 2 3".
set(COVERAGE_SRCS "")
foreach (SINGLE_SRC ${_COVERAGE_SRCS})
set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
endforeach()
# query number of logical cores
cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
# coveralls json file.
set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
add_custom_target(coveralls_generate
# Run regress tests.
COMMAND ${CMAKE_CTEST_COMMAND}
-j ${core_size}
--output-on-failure
# Generate Gcov and translate it into coveralls JSON.
COMMAND ${CMAKE_COMMAND}
-DCOVERAGE_SRCS="${COVERAGE_SRCS}"
-DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
-DCOV_PATH="${PROJECT_BINARY_DIR}"
-DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
-P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Coveralls: generating coveralls output..."
)
if (_COVERALLS_UPLOAD)
message("COVERALLS UPLOAD: ON")
# Upload the JSON to coveralls.
add_custom_target(coveralls_upload
COMMAND ${CURL_EXECUTABLE}
-S -F json_file=@${COVERALLS_FILE}
https://coveralls.io/api/v1/jobs
DEPENDS coveralls_generate
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Coveralls: uploading coveralls output...")
add_custom_target(coveralls DEPENDS coveralls_upload)
else()
message("COVERALLS UPLOAD: OFF")
add_custom_target(coveralls DEPENDS coveralls_generate)
endif()
endfunction()
if(ON_COVERALLS)
set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
set(EXCLUDE_DIRS
"demo/"
"build/"
"tests/"
".test_env/"
)
if(WITH_GPU)
file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" ".c" "*.cu")
else()
file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c")
endif()
# exclude trivial files in PADDLE_SOURCES
foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
foreach(TMP_PATH ${PADDLE_SOURCES})
string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
endif()
endforeach(TMP_PATH)
endforeach()
# convert to absolute path
set(PADDLE_SRCS "")
foreach(PADDLE_SRC ${PADDLE_SOURCES})
set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
endforeach()
code_coverage(
"${PADDLE_SRCS}"
${COVERALLS_UPLOAD}
"${PROJECT_SOURCE_DIR}/cmake"
)
endif()

File diff suppressed because it is too large.

@ -21,12 +21,6 @@ function(safe_set_flag is_c src_list flag_name)
endif() endif()
if(${safe_name}) if(${safe_name})
set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE) set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
if(is_c)
set(CUDA_NVCC_FLAGS
--compiler-options;${flag_name}
${CUDA_NVCC_FLAGS}
PARENT_SCOPE)
endif()
endif() endif()
endfunction() endfunction()
@ -40,6 +34,20 @@ macro(safe_set_cxxflag src_list flag_name)
safe_set_flag(OFF ${src_list} ${flag_name}) safe_set_flag(OFF ${src_list} ${flag_name})
endmacro() endmacro()
# helper macro to set nvcc flag
macro(safe_set_nvflag flag_name)
string(REPLACE "-" "_" safe_name ${flag_name})
string(REPLACE "=" "_" safe_name ${safe_name})
CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
if(${safe_name})
set(CUDA_NVCC_FLAGS
--compiler-options;${flag_name}
${CUDA_NVCC_FLAGS})
endif()
endmacro()
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS)
set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS) set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS)
@ -63,14 +71,44 @@ set(COMMON_FLAGS
-Wnon-virtual-dtor -Wnon-virtual-dtor
-Wdelete-non-virtual-dtor -Wdelete-non-virtual-dtor
-Wno-unused-parameter -Wno-unused-parameter
-Wno-unused-function
-Wno-error=literal-suffix -Wno-error=literal-suffix
-Wno-error=unused-local-typedefs) -Wno-error=unused-local-typedefs)
set(GPU_COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
-Wnon-virtual-dtor
-Wdelete-non-virtual-dtor
-Wno-unused-parameter
-Wno-unused-function
-Wno-error=literal-suffix
-Wno-error=unused-local-typedefs
-Wno-error=unused-function # Warnings in Numpy Header.
)
if (APPLE)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
else()
set(GPU_COMMON_FLAGS
-Wall
-Wextra
-Werror
${GPU_COMMON_FLAGS})
endif()
foreach(flag ${COMMON_FLAGS}) foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cflag(CMAKE_C_FLAGS ${flag})
safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
endforeach() endforeach()
foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag})
endforeach()
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here. # So, don't set these flags here.
@ -91,12 +129,12 @@ foreach(capability 30 35 50)
list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}") list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach() endforeach()
if (CUDA_VERSION VERSION_GREATER "7.0") if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52") list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
endif() endif()
# Modern gpu architectures: Pascal # Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0") if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
endif() endif()

@ -0,0 +1,76 @@
# The user should first download RDMA from the Subversion repository.
# Execute the following commands to check it out from svn manually:
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
# We use the static output in the svn repositories to avoid implicit bugs from a non-standard runtime env.
set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
function(generate_rdma_links)
#Redirect to the current dir to isolate the pollution from the system runtime environment.
#It gives unified control across different gcc environments.
#E.g., by default gcc48 does not refer to /usr/lib64, which could contain low-version
#runtime libraries that would crash the process while being loaded. This redirect trick
#fixes it.
execute_process(
COMMAND mkdir -p librdma
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endfunction(generate_rdma_links)
#check and set headers
find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
#check and set libs
find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
if(
RDMA_INC_SXISOCK AND
RDMA_INC_XIO AND
RDMA_INC_EVENT AND
RDMA_INC_NUMA AND
RDMA_LIB_SXISOCK AND
RDMA_LIB_XIO AND
RDMA_LIB_EVENT AND
RDMA_LIB_EVENT_CORE AND
RDMA_LIB_EVENT_EXTRA AND
RDMA_LIB_EVENT_PTHREADS AND
RDMA_LIB_NUMA
)
set(RDMA_INC_DIR
${RDMA_INC_SXISOCK}
${RDMA_INC_XIO}
${RDMA_INC_EVENT}
${RDMA_INC_NUMA})
set(RDMA_LIBS
${RDMA_LIB_SXISOCK}
${RDMA_LIB_XIO}
${RDMA_LIB_EVENT}
${RDMA_LIB_EVENT_CORE}
${RDMA_LIB_EVENT_EXTRA}
${RDMA_LIB_EVENT_PTHREADS}
${RDMA_LIB_NUMA}
)
set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
return()
endif()
#If this module is not called, RDMA_INC_DIR and RDMA_LIBS will be null, so the top-level module can always refer to these variables.
message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")

@ -1,25 +1,3 @@
find_program(
SWIG_BINARY_PATH
swig)
if(${SWIG_BINARY_PATH} STREQUAL "SWIG_BINARY_PATH-NOTFOUND")
set(SWIG_FOUND OFF)
else()
set(SWIG_FOUND ON)
endif()
set(MIN_SWIG_VERSION 2)
if(SWIG_FOUND)
execute_process(COMMAND sh -c "${SWIG_BINARY_PATH} -version | grep Version | cut -f3 -d' '"
OUTPUT_VARIABLE _SWIG_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION})
message("swig version ${MIN_SWIG_VERSION} or greater is needed for generating python api. "
"Only version ${_SWIG_VERSION} is found. Set SWIG_FOUND to FALSE")
set(SWIG_FOUND FALSE)
endif(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION})
endif(SWIG_FOUND)
function(generate_python_api target_name) function(generate_python_api target_name)
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
${PROJ_ROOT}/paddle/Paddle_wrap.cxx ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
@ -27,6 +5,7 @@ function(generate_python_api target_name)
COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig
&& mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
${PROJ_ROOT}/paddle/api/PaddleAPI.h
WORKING_DIRECTORY ${PROJ_ROOT}/paddle WORKING_DIRECTORY ${PROJ_ROOT}/paddle
COMMENT "Generate Python API from swig") COMMENT "Generate Python API from swig")
add_custom_target(${target_name} ALL DEPENDS add_custom_target(${target_name} ALL DEPENDS

@ -67,6 +67,10 @@ endmacro()
# #
# It will handle WITH_PYTHON/WITH_GLOG etc. # It will handle WITH_PYTHON/WITH_GLOG etc.
function(link_paddle_exe TARGET_NAME) function(link_paddle_exe TARGET_NAME)
if(WITH_RDMA)
generate_rdma_links()
endif()
if(WITH_METRIC) if(WITH_METRIC)
if(WITH_GPU) if(WITH_GPU)
set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu) set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu)
@ -110,6 +114,12 @@ function(link_paddle_exe TARGET_NAME)
${INTERAL_LIBS} ${INTERAL_LIBS}
${CMAKE_DL_LIBS}) ${CMAKE_DL_LIBS})
if(WITH_RDMA)
target_link_libraries(${TARGET_NAME}
${RDMA_LD_FLAGS}
${RDMA_LIBS})
endif()
if(WITH_PYTHON) if(WITH_PYTHON)
target_link_libraries(${TARGET_NAME} target_link_libraries(${TARGET_NAME}
${PYTHON_LIBRARIES}) ${PYTHON_LIBRARIES})
@ -178,9 +188,18 @@ macro(add_simple_unittest TARGET_NAME)
add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp) add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp)
endmacro() endmacro()
macro(add_paddle_culib TARGET_NAME) # Creates C resources file from files in given resource file
set(NVCC_FLAG ${CUDA_NVCC_FLAGS}) function(create_resources res_file output)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--use_fast_math) # Create empty output file
cuda_add_library(${TARGET_NAME} STATIC ${ARGN}) file(WRITE ${output} "")
set(CUDA_NVCC_FLAGS ${NVCC_FLAG}) # Get short filename
endmacro() string(REGEX MATCH "([^/]+)$" filename ${res_file})
# Replace filename spaces & extension separator for C compatibility
string(REGEX REPLACE "\\.| |-" "_" filename ${filename})
# Read hex data from file
file(READ ${res_file} filedata HEX)
# Convert hex data for C compatibility
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata})
# Append data to output file
file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
endfunction()

@ -5,3 +5,5 @@ plot.png
train.log train.log
image_provider_copy_1.py image_provider_copy_1.py
*pyc *pyc
train.list
test.list

@ -16,7 +16,6 @@ import numpy as np
import sys import sys
import os import os
import PIL.Image as Image import PIL.Image as Image
""" """
Usage: python process_cifar input_dir output_dir Usage: python process_cifar input_dir output_dir
""" """
@ -30,6 +29,7 @@ def mkdir_not_exist(path):
if not os.path.exists(path): if not os.path.exists(path):
os.mkdir(path) os.mkdir(path)
def create_dir_structure(output_dir): def create_dir_structure(output_dir):
""" """
Create the directory structure for the directory. Create the directory structure for the directory.
@ -39,8 +39,8 @@ def create_dir_structure(output_dir):
mkdir_not_exist(os.path.join(output_dir, "train")) mkdir_not_exist(os.path.join(output_dir, "train"))
mkdir_not_exist(os.path.join(output_dir, "test")) mkdir_not_exist(os.path.join(output_dir, "test"))
def convert_batch(batch_path, label_set, label_map,
output_dir, data_split): def convert_batch(batch_path, label_set, label_map, output_dir, data_split):
""" """
Convert CIFAR batch to the structure of Paddle format. Convert CIFAR batch to the structure of Paddle format.
batch_path: the batch to be converted. batch_path: the batch to be converted.
@ -67,11 +67,23 @@ if __name__ == '__main__':
output_dir = sys.argv[2] output_dir = sys.argv[2]
num_batch = 5 num_batch = 5
create_dir_structure(output_dir) create_dir_structure(output_dir)
label_map = {0: "airplane", 1: "automobile", 2: "bird", 3: "cat", 4: "deer", label_map = {
5: "dog", 6: "frog", 7: "horse", 8: "ship", 9: "truck"} 0: "airplane",
1: "automobile",
2: "bird",
3: "cat",
4: "deer",
5: "dog",
6: "frog",
7: "horse",
8: "ship",
9: "truck"
}
labels = {} labels = {}
for i in range(1, num_batch + 1): for i in range(1, num_batch + 1):
convert_batch(os.path.join(input_dir, "data_batch_%d" % i), labels, convert_batch(
label_map, output_dir, "train") os.path.join(input_dir, "data_batch_%d" % i), labels, label_map,
convert_batch(os.path.join(input_dir, "test_batch"), {}, output_dir, "train")
label_map, output_dir, "test") convert_batch(
os.path.join(input_dir, "test_batch"), {}, label_map, output_dir,
"test")

@ -46,26 +46,31 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
settings.img_mean = image_util.load_meta(settings.meta_path, settings.img_mean = image_util.load_meta(settings.meta_path,
settings.mean_img_size, settings.mean_img_size,
settings.img_size, settings.img_size, settings.color)
settings.color)
settings.logger.info('Image size: %s', settings.img_size) settings.logger.info('Image size: %s', settings.img_size)
settings.logger.info('Meta path: %s', settings.meta_path) settings.logger.info('Meta path: %s', settings.meta_path)
settings.input_types = [ settings.input_types = [
dense_vector(settings.img_raw_size), # image feature dense_vector(settings.img_raw_size), # image feature
integer_value(settings.num_classes)] # labels integer_value(settings.num_classes)
] # labels
settings.logger.info('DataProvider Initialization finished') settings.logger.info('DataProvider Initialization finished')
@provider(init_hook=hook) @provider(init_hook=hook, min_pool_size=0)
def processData(settings, file_name): def processData(settings, file_list):
""" """
The main function for loading data. The main function for loading data.
Load the batch, iterate all the images and labels in this batch. Load the batch, iterate all the images and labels in this batch.
file_name: the batch file name. file_list: the batch file list.
""" """
data = cPickle.load(io.open(file_name, 'rb')) with open(file_list, 'r') as fdata:
lines = [line.strip() for line in fdata]
random.shuffle(lines)
for file_name in lines:
with io.open(file_name.strip(), 'rb') as file:
data = cPickle.load(file)
indexes = list(range(len(data['images']))) indexes = list(range(len(data['images'])))
if settings.is_train: if settings.is_train:
random.shuffle(indexes) random.shuffle(indexes)
@ -74,8 +79,8 @@ def processData(settings, file_name):
img = image_util.decode_jpeg(data['images'][i]) img = image_util.decode_jpeg(data['images'][i])
else: else:
img = data['images'][i] img = data['images'][i]
img_feat = image_util.preprocess_img(img, settings.img_mean, img_feat = image_util.preprocess_img(
settings.img_size, settings.is_train, img, settings.img_mean, settings.img_size,
settings.color) settings.is_train, settings.color)
label = data['labels'][i] label = data['labels'][i]
yield img_feat.tolist(), int(label) yield img_feat.astype('float32'), int(label)

@ -16,6 +16,7 @@ import numpy as np
from PIL import Image from PIL import Image
from cStringIO import StringIO from cStringIO import StringIO
def resize_image(img, target_size): def resize_image(img, target_size):
""" """
Resize an image so that the shorter edge has length target_size. Resize an image so that the shorter edge has length target_size.
@ -23,10 +24,12 @@ def resize_image(img, target_size):
target_size: the target resized image size. target_size: the target resized image size.
""" """
percent = (target_size / float(min(img.size[0], img.size[1]))) percent = (target_size / float(min(img.size[0], img.size[1])))
resized_size = int(round(img.size[0] * percent)), int(round(img.size[1] * percent)) resized_size = int(round(img.size[0] * percent)), int(
round(img.size[1] * percent))
img = img.resize(resized_size, Image.ANTIALIAS) img = img.resize(resized_size, Image.ANTIALIAS)
return img return img
def flip(im): def flip(im):
""" """
Return the flipped image. Return the flipped image.
@ -38,6 +41,7 @@ def flip(im):
else: else:
return im[:, ::-1] return im[:, ::-1]
def crop_img(im, inner_size, color=True, test=True): def crop_img(im, inner_size, color=True, test=True):
""" """
Return cropped image. Return cropped image.
@ -50,7 +54,8 @@ def crop_img(im, inner_size, color=True, test=True):
If True, crop the center of images. If True, crop the center of images.
""" """
if color: if color:
height, width = max(inner_size, im.shape[1]), max(inner_size, im.shape[2]) height, width = max(inner_size, im.shape[1]), max(inner_size,
im.shape[2])
padded_im = np.zeros((3, height, width)) padded_im = np.zeros((3, height, width))
startY = (height - im.shape[1]) / 2 startY = (height - im.shape[1]) / 2
startX = (width - im.shape[2]) / 2 startX = (width - im.shape[2]) / 2
@ -58,7 +63,8 @@ def crop_img(im, inner_size, color=True, test=True):
padded_im[:, startY:endY, startX:endX] = im padded_im[:, startY:endY, startX:endX] = im
else: else:
im = im.astype('float32') im = im.astype('float32')
height, width = max(inner_size, im.shape[0]), max(inner_size, im.shape[1]) height, width = max(inner_size, im.shape[0]), max(inner_size,
im.shape[1])
padded_im = np.zeros((height, width)) padded_im = np.zeros((height, width))
startY = (height - im.shape[0]) / 2 startY = (height - im.shape[0]) / 2
startX = (width - im.shape[1]) / 2 startX = (width - im.shape[1]) / 2
@ -79,12 +85,14 @@ def crop_img(im, inner_size, color=True, test=True):
pic = flip(pic) pic = flip(pic)
return pic return pic
def decode_jpeg(jpeg_string): def decode_jpeg(jpeg_string):
np_array = np.array(Image.open(StringIO(jpeg_string))) np_array = np.array(Image.open(StringIO(jpeg_string)))
if len(np_array.shape) == 3: if len(np_array.shape) == 3:
np_array = np.transpose(np_array, (2, 0, 1)) np_array = np.transpose(np_array, (2, 0, 1))
return np_array return np_array
def preprocess_img(im, img_mean, crop_size, is_train, color=True): def preprocess_img(im, img_mean, crop_size, is_train, color=True):
""" """
Does data augmentation for images. Does data augmentation for images.
@ -99,6 +107,7 @@ def preprocess_img(im, img_mean, crop_size, is_train, color=True):
pic -= img_mean pic -= img_mean
return pic.flatten() return pic.flatten()
def load_meta(meta_path, mean_img_size, crop_size, color=True): def load_meta(meta_path, mean_img_size, crop_size, color=True):
""" """
Return the loaded meta file. Return the loaded meta file.
@ -111,15 +120,16 @@ def load_meta(meta_path, mean_img_size, crop_size, color=True):
if color: if color:
assert (mean_img_size * mean_img_size * 3 == mean.shape[0]) assert (mean_img_size * mean_img_size * 3 == mean.shape[0])
mean = mean.reshape(3, mean_img_size, mean_img_size) mean = mean.reshape(3, mean_img_size, mean_img_size)
mean = mean[:, border: border + crop_size, mean = mean[:, border:border + crop_size, border:border +
border: border + crop_size].astype('float32') crop_size].astype('float32')
else: else:
assert (mean_img_size * mean_img_size == mean.shape[0]) assert (mean_img_size * mean_img_size == mean.shape[0])
mean = mean.reshape(mean_img_size, mean_img_size) mean = mean.reshape(mean_img_size, mean_img_size)
mean = mean[border: border + crop_size, mean = mean[border:border + crop_size, border:border +
border: border + crop_size].astype('float32') crop_size].astype('float32')
return mean return mean
def load_image(img_path, is_color=True): def load_image(img_path, is_color=True):
""" """
Load image and return. Load image and return.
@ -130,6 +140,7 @@ def load_image(img_path, is_color=True):
img.load() img.load()
return img return img
def oversample(img, crop_dims): def oversample(img, crop_dims):
""" """
image : iterable of (H x W x K) ndarrays image : iterable of (H x W x K) ndarrays
@ -152,15 +163,14 @@ def oversample(img, crop_dims):
for j in w_indices: for j in w_indices:
crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1]) crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1])
curr += 1 curr += 1
crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate([ crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate(
-crop_dims / 2.0, [-crop_dims / 2.0, crop_dims / 2.0])
crop_dims / 2.0
])
crops_ix = np.tile(crops_ix, (2, 1)) crops_ix = np.tile(crops_ix, (2, 1))
# Extract crops # Extract crops
crops = np.empty((10 * len(img), crop_dims[0], crop_dims[1], crops = np.empty(
im_shape[-1]), dtype=np.float32) (10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]),
dtype=np.float32)
ix = 0 ix = 0
for im in img: for im in img:
for crop in crops_ix: for crop in crops_ix:
@ -169,9 +179,13 @@ def oversample(img, crop_dims):
crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :] # flip for mirrors crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :] # flip for mirrors
return crops return crops
class ImageTransformer: class ImageTransformer:
def __init__(self, transpose = None, def __init__(self,
channel_swap = None, mean = None, is_color = True): transpose=None,
channel_swap=None,
mean=None,
is_color=True):
self.transpose = transpose self.transpose = transpose
self.channel_swap = None self.channel_swap = None
self.mean = None self.mean = None

@ -24,9 +24,11 @@ from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO) logging.getLogger().setLevel(logging.INFO)
class ImageClassifier(): class ImageClassifier():
def __init__(self, def __init__(self,
train_conf, train_conf,
@ -69,7 +71,8 @@ class ImageClassifier():
conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu) conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu)
conf = parse_config(train_conf, conf_args) conf = parse_config(train_conf, conf_args)
swig_paddle.initPaddle("--use_gpu=%d" % (gpu)) swig_paddle.initPaddle("--use_gpu=%d" % (gpu))
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(self.network, swig_paddle.GradientMachine) assert isinstance(self.network, swig_paddle.GradientMachine)
self.network.loadParameters(self.model_dir) self.network.loadParameters(self.model_dir)
@ -90,14 +93,14 @@ class ImageClassifier():
# image_util.resize_image: short side is self.resize_dim # image_util.resize_image: short side is self.resize_dim
image = image_util.resize_image(image, self.resize_dim) image = image_util.resize_image(image, self.resize_dim)
image = np.array(image) image = np.array(image)
input = np.zeros((1, image.shape[0], image.shape[1], 3), input = np.zeros(
dtype=np.float32) (1, image.shape[0], image.shape[1], 3), dtype=np.float32)
input[0] = image.astype(np.float32) input[0] = image.astype(np.float32)
input = image_util.oversample(input, self.crop_dims) input = image_util.oversample(input, self.crop_dims)
else: else:
image = image.resize(self.crop_dims, Image.ANTIALIAS) image = image.resize(self.crop_dims, Image.ANTIALIAS)
input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3), input = np.zeros(
dtype=np.float32) (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
input[0] = np.array(image).astype(np.float32) input[0] = np.array(image).astype(np.float32)
data_in = [] data_in = []
@ -133,6 +136,7 @@ class ImageClassifier():
lab = np.argsort(-prob) lab = np.argsort(-prob)
logging.info("Label of %s is: %d", image, lab[0]) logging.info("Label of %s is: %d", image, lab[0])
if __name__ == '__main__': if __name__ == '__main__':
image_size = 32 image_size = 32
crop_size = 32 crop_size = 32
@ -144,7 +148,8 @@ if __name__ == '__main__':
image = sys.argv[2] image = sys.argv[2]
use_gpu = bool(int(sys.argv[3])) use_gpu = bool(int(sys.argv[3]))
obj = ImageClassifier(train_conf=config, obj = ImageClassifier(
train_conf=config,
model_dir=model_path, model_dir=model_path,
resize_dim=image_size, resize_dim=image_size,
crop_dim=crop_size, crop_dim=crop_size,

@ -19,22 +19,36 @@ from optparse import OptionParser
def option_parser(): def option_parser():
parser = OptionParser(usage="usage: python preprcoess.py "\ parser = OptionParser(usage="usage: python preprcoess.py "\
"-i data_dir [options]") "-i data_dir [options]")
parser.add_option("-i", "--input", action="store", parser.add_option(
dest="input", help="Input data directory.") "-i",
parser.add_option("-s", "--size", action="store", "--input",
dest="size", help="Processed image size.") action="store",
parser.add_option("-c", "--color", action="store", dest="input",
dest="color", help="whether to use color images.") help="Input data directory.")
parser.add_option(
"-s",
"--size",
action="store",
dest="size",
help="Processed image size.")
parser.add_option(
"-c",
"--color",
action="store",
dest="color",
help="whether to use color images.")
return parser.parse_args() return parser.parse_args()
if __name__ == '__main__': if __name__ == '__main__':
options, args = option_parser() options, args = option_parser()
data_dir = options.input data_dir = options.input
processed_image_size = int(options.size) processed_image_size = int(options.size)
color = options.color == "1" color = options.color == "1"
data_creator = ImageClassificationDatasetCreater(data_dir, data_creator = ImageClassificationDatasetCreater(
processed_image_size, data_dir, processed_image_size, color)
color) data_creator.train_list_name = "train.txt"
data_creator.test_list_name = "test.txt"
data_creator.num_per_batch = 1000 data_creator.num_per_batch = 1000
data_creator.overwrite = True data_creator.overwrite = True
data_creator.create_batches() data_creator.create_batches()

@ -17,3 +17,6 @@ set -e
data_dir=./data/cifar-out data_dir=./data/cifar-out
python preprocess.py -i $data_dir -s 32 -c 1 python preprocess.py -i $data_dir -s 32 -c 1
echo "data/cifar-out/batches/train.txt" > train.list
echo "data/cifar-out/batches/test.txt" > test.list

Some files were not shown because too many files have changed in this diff.
