Merge branch 'develop' of github.com:baidu/Paddle into feature/c_api

8 years ago · 18a3588b24
parent ddbb610fd3 c51ab429f3
commit 18a3588b24
122 changed files with 2794 additions and 877 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -12,19 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License

-cmake_minimum_required(VERSION 3.0)
-
-project(paddle CXX C)
-
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_SOURCE_DIR})

+include(system)
+
+if(ANDROID)
+    cmake_minimum_required(VERSION 3.7)
+else()
+    cmake_minimum_required(VERSION 3.0)
+endif()
+
+project(paddle CXX C)
+
 find_package(Sphinx)
-find_package(CUDA QUIET)
+if(NOT CMAKE_CROSSCOMPILING)
+    find_package(CUDA QUIET)
+endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)

-include(system)
 include(simd)

 ################################ Configurations #######################################
@ -52,6 +59,21 @@ if(NOT CMAKE_BUILD_TYPE)
      FORCE)
 endif()

+if(ANDROID)
+    # Quote the expansion: with an empty CMAKE_SYSTEM_VERSION the unquoted form
+    # leaves if() without a left operand and configuration dies with a syntax
+    # error instead of reaching the explicit message below.
+    if("${CMAKE_SYSTEM_VERSION}" VERSION_LESS "21")
+        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21")
+    endif()
+
+    set(WITH_GPU OFF CACHE STRING
+        "Disable GPU when cross-compiling for Android" FORCE)
+    set(WITH_AVX OFF CACHE STRING
+        "Disable AVX when cross-compiling for Android" FORCE)
+    set(WITH_PYTHON OFF CACHE STRING
+        "Disable PYTHON when cross-compiling for Android" FORCE)
+    set(WITH_RDMA OFF CACHE STRING
+        "Disable RDMA when cross-compiling for Android" FORCE)
+endif(ANDROID)
+
 set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
  "A path setting third party libraries download & build directories.")

@ -72,6 +94,7 @@ include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
+include(external/any)       # download linb::any

 include(package)            # set paddle packages
 include(cpplint)            # set paddle c++ style
@ -82,7 +105,6 @@ include(flags)              # set paddle compile flags
 include(cudnn)              # set cudnn libraries
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
-
 include(configure)          # add paddle env configuration

 include_directories("${PROJ_ROOT}")
--- a/11
+++ b/11
@ -1,19 +1,18 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
+FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>

 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'

 # ENV variables
-ARG BUILD_WOBOQ
 ARG WITH_GPU
 ARG WITH_AVX
 ARG WITH_DOC
 ARG WITH_STYLE_CHECK

-ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
-ENV WITH_GPU=${WITH_AVX:-OFF}
+ENV WOBOQ OFF
+ENV WITH_GPU=${WITH_GPU:-OFF}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
@ -37,18 +36,20 @@ RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8

+# FIXME: due to a temporary ipykernel dependency issue, pin the ipykernel and
+# jupyter versions until jupyter fixes this issue.
 RUN pip install --upgrade pip && \
    pip install -U 'protobuf==3.1.0' && \
    pip install -U wheel pillow BeautifulSoup && \
    pip install -U docopt PyYAML sphinx && \
    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip install -U pre-commit 'requests==2.9.2' jupyter
+    pip install pre-commit 'requests==2.9.2' 'ipykernel==4.6.0' 'jupyter==1.0.0'

 RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
    cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
    cd .. && rm -rf cmake-3.4.1

-VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
+VOLUME ["/woboq_out"]

 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@ -19,9 +19,9 @@ set(CBLAS_FOUND OFF)
 set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
 set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")

-find_path(MKL_INCLUDE_DIR mkl.h PATHS
+find_path(MKL_INC_DIR mkl.h PATHS
  ${MKL_ROOT}/include)
-find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
+find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
  ${MKL_ROOT}/include)
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
  ${MKL_ROOT}/lib
@ -34,15 +34,19 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
  ${MKL_ROOT}/lib/intel64)


-if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
+if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
  set(CBLAS_PROVIDER MKL)
-  set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR})
+  set(CBLAS_INC_DIR ${MKL_INC_DIR})
  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64}
          ${MKL_SEQUENTIAL_LIB}
          ${MKL_CORE_LIB})
  add_definitions(-DPADDLE_USE_MKL)
  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
  set(CBLAS_FOUND ON)
+  # Test the variable by name. if(${VAR}) would expand to the header path and
+  # if() would then look up a variable with that name, which is never defined,
+  # so PADDLE_USE_LAPACK would not be set even when mkl_lapacke.h was found.
+  if(MKL_LAPACK_INC_DIR)
+    add_definitions(-DPADDLE_USE_LAPACK)
+    message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
+  endif()
  return() # return file.
 endif()

@ -68,13 +72,17 @@ find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
 find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
  PATHS ${ATLAS_LIB_SEARCH_PATHS})

-if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
+if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND)
  set(CBLAS_PROVIDER ATLAS)
-  set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
+  set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
  set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
  add_definitions(-DPADDLE_USE_ATLAS)  
-  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
  set(CBLAS_FOUND ON)
+  if(ATLAS_CLAPACK_INC_DIR)
+    add_definitions(-DPADDLE_USE_LAPACK)
+    message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
+  endif()
  return()
 endif()

@ -103,8 +111,12 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
  set(CBLAS_PROVIDER OPENBLAS)
  set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
  set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
-  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
  set(CBLAS_FOUND ON)
+  if(OPENBLAS_LAPACKE_INC_DIR)
+    add_definitions(-DPADDLE_USE_LAPACK)
+    message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
+  endif()
  return()
 endif()

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -32,6 +32,14 @@ if(NOT WITH_PROFILER)
    add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)

+if(NOT CMAKE_CROSSCOMPILING)
+    if(WITH_AVX AND AVX_FOUND)
+        set(SIMD_FLAG ${AVX_FLAG})
+    elseif(SSE3_FOUND)
+        set(SIMD_FLAG ${SSE3_FLAG})
+    endif()
+endif()
+
 if(NOT WITH_GPU)
    add_definitions(-DPADDLE_ONLY_CPU)
    add_definitions(-DHPPL_STUB_FUNC)
@ -48,21 +56,12 @@ else()
        message(FATAL_ERROR "Paddle need cudnn to compile")
    endif()

-    if(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
-    else(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
-    endif(WITH_AVX)
+    # SIMD_FLAG is only set when not cross-compiling and AVX or SSE3 was
+    # detected; skip the option otherwise so nvcc is not handed a dangling
+    # "-Xcompiler" with nothing after it.
+    if(SIMD_FLAG)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
+    endif()

    # Include cuda and cudnn
    include_directories(${CUDNN_INCLUDE_DIR})
    include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)

-if(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
-else(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
-endif(WITH_AVX)
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@ -1,3 +1,7 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
 set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
 find_path(CUDNN_INCLUDE_DIR cudnn.h
    PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
@ -11,6 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
    ${CUDNN_ROOT}
    ${CUDNN_ROOT}/lib64
    ${CUDNN_ROOT}/lib
+    ${CUDNN_ROOT}/lib/x86_64-linux-gnu
    $ENV{CUDNN_ROOT}
    $ENV{CUDNN_ROOT}/lib64
    $ENV{CUDNN_ROOT}/lib
--- a/cmake/external/any.cmake
+++ b/cmake/external/any.cmake
@ -0,0 +1,20 @@
+# Fetches the linb::any sources at build time. Only the download step runs;
+# the configure, build, install and test steps are disabled below.
+INCLUDE(ExternalProject)
+
+SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
+
+# ExternalProject checks sources out under <PREFIX>/src/<name>.
+INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any)
+
+ExternalProject_Add(
+    linb_any
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/thelink2012/any.git"
+    GIT_TAG         "8fef1e93710a0edf8d7658999e284a1142c4c020"
+    PREFIX          ${ANY_SOURCE_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+)
+
+add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
+
+# Register the download like the other external projects so that targets wired
+# through external_project_dependencies wait for the headers to exist.
+LIST(APPEND external_project_dependencies linb_any)
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@ -31,9 +31,17 @@ ExternalProject_Add(
    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
    PREFIX          ${GFLAGS_SOURCES_DIR}
    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    CMAKE_ARGS      -DBUILD_TESTING=OFF
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )

 LIST(APPEND external_project_dependencies gflags)
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@ -33,11 +33,19 @@ ExternalProject_Add(
    GIT_REPOSITORY  "https://github.com/google/glog.git"
    PREFIX          ${GLOG_SOURCES_DIR}
    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    CMAKE_ARGS      -DWITH_GFLAGS=ON
    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
    CMAKE_ARGS      -DBUILD_TESTING=OFF
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )

 LIST(APPEND external_project_dependencies glog)
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@ -41,11 +41,19 @@ IF(WITH_TESTING)
        GIT_TAG         "release-1.8.0"
        PREFIX          ${GTEST_SOURCES_DIR}
        UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+        CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+        CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
        CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
        CMAKE_ARGS      -DBUILD_GMOCK=ON
        CMAKE_ARGS      -Dgtest_disable_pthreads=ON
        CMAKE_ARGS      -Dgtest_force_shared_crt=ON
+        CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                         -DCMAKE_BUILD_TYPE:STRING=Release
    )
    LIST(APPEND external_project_dependencies gtest)
 ENDIF(WITH_TESTING)
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@ -29,7 +29,24 @@ IF(NOT ${CBLAS_FOUND})

    IF(CMAKE_COMPILER_IS_GNUCC)
        ENABLE_LANGUAGE(Fortran)
-        LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
+        if (NOT CMAKE_Fortran_COMPILER_VERSION)
+          # cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly.
+          execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion
+                    OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+        endif()
+        # Quote the version so an empty string yields an empty list instead of
+        # a malformed string() call.
+        string(REGEX MATCHALL "[0-9]+" Fortran_VERSION "${CMAKE_Fortran_COMPILER_VERSION}")
+        # The version may have a single component (-dumpversion can print just
+        # the major number, e.g. "7"), so only index components that exist;
+        # list(GET) is a hard error on an out-of-range index.
+        list(LENGTH Fortran_VERSION Fortran_VERSION_LENGTH)
+        set(GFORTRAN_SEARCH_PATHS /lib /usr/lib)
+        if(Fortran_VERSION_LENGTH GREATER 0)
+            list(GET Fortran_VERSION 0 Fortran_MAJOR)
+            if(Fortran_VERSION_LENGTH GREATER 1)
+                list(GET Fortran_VERSION 1 Fortran_MINOR)
+                list(APPEND GFORTRAN_SEARCH_PATHS
+                     /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/)
+            endif()
+            list(APPEND GFORTRAN_SEARCH_PATHS
+                 /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/)
+        endif()
+        find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS ${GFORTRAN_SEARCH_PATHS})
+        if (NOT GFORTRAN_LIBRARY)
+            message(FATAL_ERROR "Cannot found gfortran library which it is used by openblas")
+        endif()
+        find_package(Threads REQUIRED)
+        LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
    ENDIF(CMAKE_COMPILER_IS_GNUCC)

    IF(NOT CMAKE_Fortran_COMPILER)
@ -37,6 +54,8 @@ IF(NOT ${CBLAS_FOUND})
                "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
    ENDIF(NOT CMAKE_Fortran_COMPILER)

+    ADD_DEFINITIONS(-DPADDLE_USE_LAPACK)
+
    ExternalProject_Add(
        openblas
        ${EXTERNAL_PROJECT_LOG_ARGS}
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@ -58,12 +58,20 @@ IF(NOT PROTOBUF_FOUND)
        GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
        CONFIGURE_COMMAND
        ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
-        -Dprotobuf_BUILD_TESTS=OFF
-        -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-        -DCMAKE_BUILD_TYPE=Release
-        -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-        -DCMAKE_INSTALL_LIBDIR=lib
+            -Dprotobuf_BUILD_TESTS=OFF
+            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
+            -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+            -DCMAKE_INSTALL_LIBDIR=lib
+        CMAKE_CACHE_ARGS
+            -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
+            -DCMAKE_BUILD_TYPE:STRING=Release
+            -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+            -DZLIB_ROOT:STRING=${ZLIB_ROOT}
    )

    LIST(APPEND external_project_dependencies protobuf)
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@ -219,9 +219,9 @@ ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)

 ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)

-INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
-INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
-
-IF(NOT WITH_PYTHON)
+IF(WITH_PYTHON)
+    INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
+    INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
+ELSE()
    SET(PYTHON_LIBRARIES "")
 ENDIF()
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@ -50,12 +50,19 @@ ExternalProject_Add(
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
    CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
    CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
    CMAKE_ARGS      -DWITH_TORCH=OFF
-    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
+    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
    CMAKE_ARGS      -DBUILD_SHARED=ON
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )

 LIST(APPEND external_project_dependencies warpctc)
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@ -22,7 +22,7 @@ SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include dire
 IF(WIN32)
  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
 ELSE(WIN32)
-  set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
 ENDIF(WIN32)

 INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
@ -34,10 +34,18 @@ ExternalProject_Add(
    GIT_TAG         "v1.2.8"
    PREFIX          ${ZLIB_SOURCES_DIR}
    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
    CMAKE_ARGS      -DBUILD_SHARED_LIBS=OFF
    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    CMAKE_ARGS      -DCMAKE_MACOSX_RPATH=ON
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )

 LIST(APPEND external_project_dependencies zlib)
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@ -2,6 +2,7 @@
 include(CheckCXXCompilerFlag)
 include(CheckCCompilerFlag)
 include(CheckCXXSymbolExists)
+include(CheckTypeSize)

 function(CheckCompilerCXX11Flag)
    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
@ -25,7 +26,7 @@ function(CheckCompilerCXX11Flag)
 endfunction()

 CheckCompilerCXX11Flag()
-LIST(APPEND CMAKE_CXX_FLAGS -std=c++11)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")

 # safe_set_flag
 #
@ -83,6 +84,17 @@ if(NOT UINT64_MAX_EXISTS)
  endif()
 endif()

+# Probe for optional pthread types. CHECK_TYPE_SIZE stores the type's size in
+# <VARIABLE> and whether the type exists in HAVE_<VARIABLE>, so the existence
+# flag is what gets tested. The caller's CMAKE_EXTRA_INCLUDE_FILES is saved and
+# restored rather than cleared.
+SET(CMAKE_EXTRA_INCLUDE_FILES_RETAINED "${CMAKE_EXTRA_INCLUDE_FILES}")
+SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h")
+CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND)
+CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND)
+if(HAVE_SPINLOCK_FOUND)
+  add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK)
+endif()
+if(HAVE_BARRIER_FOUND)
+  add_definitions(-DPADDLE_USE_PTHREAD_BARRIER)
+endif()
+SET(CMAKE_EXTRA_INCLUDE_FILES "${CMAKE_EXTRA_INCLUDE_FILES_RETAINED}")
+
 # Common flags. the compiler flag used for C/C++ sources whenever release or debug
 # Do not care if this flag is support for gcc.
 set(COMMON_FLAGS
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@ -2,6 +2,7 @@
 # so that PaddlePaddle can unleash the vectorization power of muticore.

 INCLUDE(CheckCXXSourceRuns)
+INCLUDE(CheckCXXSourceCompiles)

 IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    set(MMX_FLAG "-mmmx")
@ -17,6 +18,8 @@ ELSEIF(MSVC)
    SET(AVX2_FLAG "/arch:AVX2")
 ENDIF()

+set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
+
 # Check  MMX
 set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
 CHECK_CXX_SOURCE_RUNS("
@ -73,4 +76,5 @@ int main()
    return 0;
 }" AVX2_FOUND)

+set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@ -67,6 +67,12 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
 MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
 MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")

+# Flag Android builds when the target system name is "Android".
+IF(DEFINED CMAKE_SYSTEM_NAME)
+    # Quoted: a defined-but-empty CMAKE_SYSTEM_NAME would otherwise leave
+    # STREQUAL without a left operand and abort configuration.
+    IF("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+        SET(ANDROID TRUE)
+    ENDIF()
+ENDIF()
+
 # external dependencies log output
 SET(EXTERNAL_PROJECT_LOG_ARGS
    LOG_DOWNLOAD    0     # Wrap download in script to log output
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@ -90,6 +90,10 @@ function(link_paddle_exe TARGET_NAME)
        ${RDMA_LD_FLAGS}
        ${RDMA_LIBS})

+    if(ANDROID)
+        target_link_libraries(${TARGET_NAME} log)
+    endif(ANDROID)
+
    add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()

--- a/demo/seqToseq/api_train_v2.py
+++ b/demo/seqToseq/api_train_v2.py
@ -1,13 +1,17 @@
 import sys
+
 import paddle.v2 as paddle


-def seqToseq_net(source_dict_dim, target_dict_dim):
+def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
    ### Network Architecture
    word_vector_dim = 512  # dimension of word vector
    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
    encoder_size = 512  # dimension of hidden unit in GRU Encoder network

+    beam_size = 3
+    max_length = 250
+
    #### Encoder
    src_word_id = paddle.layer.data(
        name='source_language_word',
@ -67,79 +71,143 @@ def seqToseq_net(source_dict_dim, target_dict_dim):
    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]

-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For decoder equipped with attention mechanism, in training,
-    # target embeding (the groudtruth) is the data input,
-    # while encoded source sequence is accessed to as an unbounded memory.
-    # Here, the StaticInput defines a read-only memory
-    # for the recurrent_group.
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
+    if not is_generating:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
+
+        # For decoder equipped with attention mechanism, in training,
+        # target embedding (the ground truth) is the data input,
+        # while encoded source sequence is accessed as an unbounded memory.
+        # Here, the StaticInput defines a read-only memory
+        # for the recurrent_group.
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
+    else:
+        # In generation, the decoder predicts a next target word based on
+        # the encoded source sequence and the last generated target word.
+
+        # The encoded source sequence (encoder's output) must be specified by
+        # StaticInput, which is a read-only memory.
+        # The embedding of the last generated word is obtained automatically via
+        # GeneratedInputs, which is initialized by a start mark, such as <s>,
+        # and must be included in generation.
+
+        trg_embedding = paddle.layer.GeneratedInputV2(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+        group_inputs.append(trg_embedding)
+
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+
+        return beam_gen


 def main():
    paddle.init(use_gpu=False, trainer_count=1)
+    is_generating = False

    # source and target dict dim.
    dict_size = 30000
    source_dict_dim = target_dict_dim = dict_size

-    # define network topology
-    cost = seqToseq_net(source_dict_dim, target_dict_dim)
-    parameters = paddle.parameters.create(cost)
-
-    # define optimize method and trainer
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=5e-5,
-        regularization=paddle.optimizer.L2Regularization(rate=1e-3))
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer)
-
-    # define data reader
-    feeding = {
-        'source_language_word': 0,
-        'target_language_word': 1,
-        'target_language_next_word': 2
-    }
-
-    wmt14_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
-        batch_size=5)
-
-    # define event_handler callback
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 10 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
+    # train the network
+    if not is_generating:
+        cost = seqToseq_net(source_dict_dim, target_dict_dim)
+        parameters = paddle.parameters.create(cost)
+
+        # define optimize method and trainer
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=5e-5,
+            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+        trainer = paddle.trainer.SGD(cost=cost,
+                                     parameters=parameters,
+                                     update_equation=optimizer)
+        # define data reader
+        wmt14_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
+            batch_size=5)
+
+        # define event_handler callback
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 10 == 0:
+                    print "\nPass %d, Batch %d, Cost %f, %s" % (
+                        event.pass_id, event.batch_id, event.cost,
+                        event.metrics)
+                else:
+                    sys.stdout.write('.')
+                    sys.stdout.flush()
+
+        # start to train
+        trainer.train(
+            reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+
+    # generate a French translation of an English sequence
+    else:
+        # use the first 3 samples for generation
+        gen_creator = paddle.dataset.wmt14.gen(dict_size)
+        gen_data = []
+        gen_num = 3
+        for item in gen_creator():
+            gen_data.append((item[0], ))
+            if len(gen_data) == gen_num:
+                break
+
+        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
+        # get the pretrained model, whose bleu = 26.92
+        parameters = paddle.dataset.wmt14.model()
+        # prob is the prediction probabilities, and id is the prediction word. 
+        beam_result = paddle.infer(
+            output_layer=beam_gen,
+            parameters=parameters,
+            input=gen_data,
+            field=['prob', 'id'])
+
+        # get the dictionary
+        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+        # the delimited element of generated sequences is -1,
+        # the first element of each generated sequence is the sequence length
+        seq_list = []
+        seq = []
+        for w in beam_result[1]:
+            if w != -1:
+                seq.append(w)
            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-
-    # start to train
-    trainer.train(
-        reader=wmt14_reader,
-        event_handler=event_handler,
-        num_passes=10000,
-        feeding=feeding)
+                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
+                seq = []
+
+        prob = beam_result[0]
+        beam_size = 3
+        for i in xrange(gen_num):
+            print "\n*******************************************************\n"
+            print "src:", ' '.join(
+                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
+            for j in xrange(beam_size):
+                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]


 if __name__ == '__main__':
--- a/doc/design/dist/README.md
+++ b/doc/design/dist/README.md
@ -0,0 +1,172 @@
+# Design Doc: Distributed Training
+
+## Objective
+
+In [these slides](https://www.slideshare.net/cxwangyi/paddlepaddle-a-complete-solution-for-businesses), we explained that we'd like PaddlePaddle to run on general-purpose clusters like those managed by Kubernetes, so as to address demands for AI from both Internet and non-Internet industries.
+
+This poses technical challenges to PaddlePaddle:
+
+1. Support fault-recovery.
+1. Support both offline and online training.
+1. [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) of distributed training.
+
+
+## Training Job
+
+A training job will be created once a user asks Paddle cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes:
+
+1. the *master process*, which dispatches tasks to
+1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via
+1. one or more *parameter server processes*, where each holds a shard of the global model.
+
+Their relation is illustrated in the following graph:
+
+<img src="src/paddle-model-sharding.png"/>
+
+### Master Process
+
+The master process will:
+
+- Partition a dataset into [tasks](#task) and dispatch tasks to trainers.
+- Keep track of training progress on the dataset with the [task queue](#task-queue). A training job will iterate over the dataset for a full pass before it goes into the next pass.
+
+
+#### Task 
+
+A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size.
+
+#### Task Queue
+
+The master process has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master process. Each master process has three task queues.
+
+<img src="src/paddle-task-queues.png"/>
+
+- The todo queue holds tasks to be dispatched. When a job starts, the master process fills in the todo queue with all tasks.
+- The pending queue holds tasks that are currently being trained by trainers.
+- The done queue holds tasks that are already trained.
+
+The life cycle of a single task is illustrated below:
+
+<img src="src/paddle-task-states.png"/>
+
+1. When a new pass of training starts, all tasks will be placed in the todo queue.
+1. The master process will dispatch a few tasks to each trainer at a time, put them in the pending queue, and wait for completion.
+1. The trainer will work on its tasks and tell the master process once a task is completed. The master process will dispatch a new task to that trainer.
+1. If a task times out, the master process will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
+1. The master process will move each completed task to the done queue. When the todo queue is empty, the master process will start a new pass by moving all tasks in the done queue to the todo queue and resetting the timeout counter of all tasks to zero.
+
+### Trainer Process
+
+The trainer process will:
+
+- Receive tasks from the master.
+- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
+
+### Parameter Server Process
+
+Parameter server processes hold the parameters collaboratively. The parameters are partitioned on different parameter servers.
+
+The parameter server will:
+
+- Receive gradient from the trainers, update its parameters, and give the trainers the latest parameters.
+- Periodically save its parameters to a distributed file system by overwriting the previous save.
+
+### Optimization Algorithms
+
+The communication pattern between the trainers and the parameter servers depends on the category of optimization algorithm:
+
+- Synchronous Stochastic Gradient Descent (sync-SGD)
+
+	The parameter server will wait for all trainers to finish the n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer will wait for the new parameters before starting the (n+1)-th mini-batch.
+  
+- Asynchronous Stochastic Gradient Descent (async-SGD)
+
+	There will be no synchronization between different trainers, and the parameter server updates its parameters as soon as it receives a new gradient:
+
+	- Each trainer uploads its accumulated gradient every n mini-batches.
+	- Every m mini-batches, the trainer downloads new parameters from parameter server.
+	- n and m do not have to be equal.
+
+## Fault Tolerant
+
+The training job will pause if the master process is dead, or if any of the parameter server processes is dead. They will be restarted by [Kubernetes](https://kubernetes.io/) and recover in a few minutes. Please refer to [fault recovery](#fault-recovery).
+
+The training job will continue to make progress if there is at least one training process running. The strategy depends on the type of optimization algorithm:
+
+- sync-SGD
+
+	TODO
+
+- async-SGD
+
+	Since async-SGD does not require synchronization between mini-batches, the system will by definition make progress if at least one trainer is running.
+
+## Fault Recovery
+
+PaddlePaddle uses [etcd](https://github.com/coreos/etcd) to keep track of the states of processes. Because etcd is a distributed reliable key-value store, the restarted process can recover its states from etcd. The model parameters are periodically saved into a distributed file system, so a restarted parameter server can recover its parameters from the saved file.
+
+Now we will introduce how each process recovers from a failure. The graph below shows how etcd is used:
+
+<img src="src/paddle-etcd.png"/>
+
+### Master Process
+
+When the master is started by Kubernetes, it executes the following steps at startup:
+
+1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
+1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
+1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
+1. Starts dispatching the tasks to the trainers, and updates the task queue using an etcd transaction to ensure the lock is held during the update.
+
+The master process will kill itself if its etcd lease expires.
+
+When the master process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in a few minutes.
+
+### Trainer Process
+
+When the trainer is started by Kubernetes, it executes the following steps at startup:
+
+1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count.
+1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline.
+1. Waits for tasks from the master to start training.
+
+If the trainer's etcd lease expires, it will try to set the key `/trainer/<unique ID>` again so that the master process can discover the trainer again.
+
+### Parameter Server Process
+
+When the parameter server is started by Kubernetes, it executes the following steps at startup:
+
+1. Reads the desired total number of parameter servers from the etcd key `/ps_desired`.
+1. Searches through etcd keys `/ps/<index>` (`/ps/0`, `/ps/1`, ...) to find the first non-existent key whose index is smaller than the total number of parameter servers, and sets that key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name.
+
+	The desired number of parameter servers is 3:
+	
+	<img src="src/paddle-ps-0.png"/>
+	
+	The third parameter server joined:
+	
+	<img src="src/paddle-ps-1.png"/>
+
+1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index).
+1. Now the parameter server is ready for the trainers' requests.
+
+If the parameter server's etcd lease expires, the parameter server will kill itself.
+
+
+## Dynamic Scaling
+
+### Trainer Scaling
+
+TODO
+
+### Parameter Server Scaling
+
+Not planned for v1.
+
+## Training Dataset Format
+
+TODO
+
+## User Interface
+
+TODO
--- a/doc/design/dist/src/paddle-etcd.graffle
+++ b/doc/design/dist/src/paddle-etcd.graffle
--- a/doc/design/dist/src/paddle-etcd.png
+++ b/doc/design/dist/src/paddle-etcd.png
--- a/doc/design/dist/src/paddle-model-sharding.graffle
+++ b/doc/design/dist/src/paddle-model-sharding.graffle
--- a/doc/design/dist/src/paddle-model-sharding.png
+++ b/doc/design/dist/src/paddle-model-sharding.png
--- a/doc/design/dist/src/paddle-ps-0.png
+++ b/doc/design/dist/src/paddle-ps-0.png
--- a/Show More
+++ b/Show More