diff --git a/.gitignore b/.gitignore index 6aae076a49..ee7c6ec370 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ build/ .project .cproject .pydevproject +.settings/ Makefile .test_env/ third_party/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b0682c4fe..1a59db8c71 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,19 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.0) - -project(paddle CXX C) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") set(PROJ_ROOT ${CMAKE_SOURCE_DIR}) +include(system) + +if(ANDROID) + cmake_minimum_required(VERSION 3.7) +else() + cmake_minimum_required(VERSION 3.0) +endif() + +project(paddle CXX C) + find_package(Sphinx) -find_package(CUDA QUIET) +if(NOT CMAKE_CROSSCOMPILING) + find_package(CUDA QUIET) +endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) -include(system) include(simd) ################################ Configurations ####################################### @@ -40,7 +47,7 @@ option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) -option(ON_COVERALLS "Compile PaddlePaddle with code coverage" OFF) +option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) @@ -51,6 +58,21 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() +if(ANDROID) + if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + message(FATAL_ERROR "Standalone toolchains with an Android API level lower than 21 are not supported") + endif() + + set(WITH_GPU OFF CACHE STRING + "Disable GPU when cross-compiling for Android" FORCE) + set(WITH_AVX OFF CACHE STRING + "Disable AVX when cross-compiling for Android" FORCE) + set(WITH_PYTHON OFF CACHE STRING + "Disable PYTHON when cross-compiling for Android" FORCE) + set(WITH_RDMA OFF CACHE STRING + "Disable RDMA when cross-compiling for Android" FORCE) +endif(ANDROID) + set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") ######################################################################################## @@ -64,6 +86,7 @@ include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc +include(external/any) # download linb::any include(package) # set paddle packages include(cpplint) # set paddle c++ style @@ -74,7 +97,6 @@ include(flags) # set paddle compile flags include(cudnn) # set cudnn libraries include(version) # set PADDLE_VERSION include(coveralls) # set code coverage - include(configure) # add paddle env configuration include_directories("${PROJ_ROOT}") @@ -82,14 +104,21 @@ include_directories("${PROJ_ROOT}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") set(EXTERNAL_LIBS - # have not include gtest here.
${GFLAGS_LIBRARIES} ${GLOG_LIBRARIES} ${CBLAS_LIBRARIES} ${PROTOBUF_LIBRARY} ${ZLIB_LIBRARIES} + ${PYTHON_LIBRARIES} ) +if(WITH_GPU) + list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) + if(NOT WITH_DSO) + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + endif(NOT WITH_DSO) +endif(WITH_GPU) + add_subdirectory(proto) add_subdirectory(paddle) add_subdirectory(python) diff --git a/Dockerfile b/Dockerfile index ccd43be668..97947adf45 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,18 @@ # A image for building paddle binaries # Use cuda devel base image for both cpu and gpu environment -FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 +FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Authors ARG UBUNTU_MIRROR RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' # ENV variables -ARG BUILD_WOBOQ ARG WITH_GPU ARG WITH_AVX ARG WITH_DOC ARG WITH_STYLE_CHECK -ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF} +ENV WOBOQ OFF ENV WITH_GPU=${WITH_AVX:-OFF} ENV WITH_AVX=${WITH_AVX:-ON} ENV WITH_DOC=${WITH_DOC:-OFF} @@ -37,18 +36,20 @@ RUN git config --global credential.helper store # Fix locales to en_US.UTF-8 RUN localedef -i en_US -f UTF-8 en_US.UTF-8 +# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter +# version until jupyter fixes this issue. RUN pip install --upgrade pip && \ pip install -U 'protobuf==3.1.0' && \ pip install -U wheel pillow BeautifulSoup && \ pip install -U docopt PyYAML sphinx && \ pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \ - pip install -U pre-commit 'requests==2.9.2' jupyter + pip install pre-commit 'requests==2.9.2' 'ipykernel==4.6.0' 'jupyter==1.0.0' RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \ cd .. && rm -rf cmake-3.4.1 -VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"] +VOLUME ["/woboq_out"] # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 235c95f017..b8bf1bb07a 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -19,9 +19,9 @@ set(CBLAS_FOUND OFF) set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL") -find_path(MKL_INCLUDE_DIR mkl.h PATHS +find_path(MKL_INC_DIR mkl.h PATHS ${MKL_ROOT}/include) -find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS +find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS ${MKL_ROOT}/include) find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib @@ -34,15 +34,19 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib/intel64) -if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) +if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) set(CBLAS_PROVIDER MKL) - set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR}) + set(CBLAS_INC_DIR ${MKL_INC_DIR}) set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB}) add_definitions(-DPADDLE_USE_MKL) message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(MKL_LAPACK_INC_DIR) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})") + endif() return() # return file.
endif() @@ -68,13 +72,17 @@ find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3 PATHS ${ATLAS_LIB_SEARCH_PATHS}) -if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB) +if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND) set(CBLAS_PROVIDER ATLAS) - set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR}) + set(CBLAS_INC_DIR ${ATLAS_INC_DIR}) set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB}) add_definitions(-DPADDLE_USE_ATLAS) - message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(ATLAS_CLAPACK_INC_DIR) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})") + endif() return() endif() @@ -103,8 +111,12 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB) set(CBLAS_PROVIDER OPENBLAS) set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR}) set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) - message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) + if(OPENBLAS_LAPACKE_INC_DIR) + add_definitions(-DPADDLE_USE_LAPACK) + message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + endif() return() endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 0bb016201d..5e507e78f7 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -32,6 +32,14 @@ if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) +if(NOT CMAKE_CROSSCOMPILING) + if(WITH_AVX AND AVX_FOUND) + set(SIMD_FLAG ${AVX_FLAG}) + elseif(SSE3_FOUND) + set(SIMD_FLAG ${SSE3_FLAG}) + endif() +endif() + if(NOT WITH_GPU) add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) @@ -48,21 +56,12 @@ else() message(FATAL_ERROR "Paddle need cudnn to compile") endif() - if(WITH_AVX) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}") - else(WITH_AVX) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}") - endif(WITH_AVX) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") # Include cuda and cudnn include_directories(${CUDNN_INCLUDE_DIR}) include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) -if(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}") -else(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}") -endif(WITH_AVX) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake index 9be7643819..ca1471cabb 100644 --- a/cmake/coveralls.cmake +++ b/cmake/coveralls.cmake @@ -61,7 +61,7 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH) endif() endfunction() -if(ON_COVERALLS) +if(WITH_COVERAGE) set(CMAKE_BUILD_TYPE "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake index 6d1a1a7e9b..4641184fcf 100644 --- a/cmake/coverallsGcovJsons.cmake +++ b/cmake/coverallsGcovJsons.cmake @@ -134,7 +134,7 @@ foreach(GCDA ${GCDA_FILES}) # If -p is not specified then the file 
is named only "the_file.c.gcov" # execute_process( - COMMAND "${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}" + COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} OUTPUT_QUIET WORKING_DIRECTORY ${GCDA_DIR} ) endforeach() diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index e5b59be193..af9be86961 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -1,3 +1,7 @@ +if(NOT WITH_GPU) + return() +endif() + set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT") find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include @@ -11,6 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT} ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib + ${CUDNN_ROOT}/lib/x86_64-linux-gnu $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake new file mode 100644 index 0000000000..8116f235d5 --- /dev/null +++ b/cmake/external/any.cmake @@ -0,0 +1,20 @@ +INCLUDE(ExternalProject) + +SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any) + +INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any) + +ExternalProject_Add( + linb_any + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/thelink2012/any.git" + GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" + PREFIX ${ANY_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 2a49d76eb3..0afb3ab9af 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -31,9 +31,17 @@ ExternalProject_Add( GIT_REPOSITORY "https://github.com/gflags/gflags.git" PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DBUILD_TESTING=OFF + CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=Release ) LIST(APPEND external_project_dependencies gflags) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index ab105611c8..4a9e2ecc6b 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -33,11 +33,19 @@ ExternalProject_Add( GIT_REPOSITORY "https://github.com/google/glog.git" PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DWITH_GFLAGS=ON CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags CMAKE_ARGS -DBUILD_TESTING=OFF + CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=Release ) LIST(APPEND external_project_dependencies glog) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 11d829a9e2..49c7d71443 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -41,11 +41,19 @@ IF(WITH_TESTING) GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" - CMAKE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DBUILD_GMOCK=ON CMAKE_ARGS -Dgtest_disable_pthreads=ON CMAKE_ARGS -Dgtest_force_shared_crt=ON + CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=Release ) LIST(APPEND external_project_dependencies gtest) ENDIF(WITH_TESTING) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 29d17691db..92ea23c763 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -29,7 +29,24 @@ IF(NOT ${CBLAS_FOUND}) IF(CMAKE_COMPILER_IS_GNUCC) ENABLE_LANGUAGE(Fortran) - LIST(APPEND CBLAS_LIBRARIES gfortran pthread) + if (NOT CMAKE_Fortran_COMPILER_VERSION) + # cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly. + execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion + OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION) + endif() + string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION}) + list(GET Fortran_VERSION 0 Fortran_MAJOR) + list(GET Fortran_VERSION 1 Fortran_MINOR) + find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS + /lib + /usr/lib + /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/ + /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/) + if (NOT GFORTRAN_LIBRARY) + message(FATAL_ERROR "Cannot find the gfortran library, which is required by openblas") + endif() + find_package(Threads REQUIRED) + LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) ENDIF(CMAKE_COMPILER_IS_GNUCC) IF(NOT CMAKE_Fortran_COMPILER) @@ -37,6 +54,8 @@ IF(NOT ${CBLAS_FOUND}) "you need to set gfortran compiler: cmake ..
-DCMAKE_Fortran_COMPILER=...") ENDIF(NOT CMAKE_Fortran_COMPILER) + ADD_DEFINITIONS(-DPADDLE_USE_LAPACK) + ExternalProject_Add( openblas ${EXTERNAL_PROJECT_LOG_ARGS} @@ -45,7 +64,7 @@ IF(NOT ${CBLAS_FOUND}) PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 - BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX= UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 446a7532c5..2df042d226 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -14,7 +14,8 @@ INCLUDE(ExternalProject) -FIND_PACKAGE(Protobuf 3.1) +set(PROTOBUF_VERSION 3.1) +FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION}) IF(PROTOBUF_FOUND) EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION) @@ -57,12 +58,20 @@ IF(NOT PROTOBUF_FOUND) GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" CONFIGURE_COMMAND ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake - -Dprotobuf_BUILD_TESTS=OFF - -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib + -Dprotobuf_BUILD_TESTS=OFF + -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=lib + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DZLIB_ROOT:STRING=${ZLIB_ROOT} ) LIST(APPEND external_project_dependencies protobuf) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 0accf1a8dd..9fd3afd099 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -219,5 +219,9 @@ ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) -INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) -INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) +IF(WITH_PYTHON) + INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) + INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) +ELSE() + SET(PYTHON_LIBRARIES "") +ENDIF() diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 172c318b35..293070c3cf 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -50,12 +50,19 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} CMAKE_ARGS -DWITH_GPU=${WITH_GPU} CMAKE_ARGS -DWITH_OMP=${USE_OMP} CMAKE_ARGS -DWITH_TORCH=OFF - CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE + CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON CMAKE_ARGS -DBUILD_SHARED=ON + CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON + CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + 
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) LIST(APPEND external_project_dependencies warpctc) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 47fa8817fb..45ca5542b7 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -22,7 +22,7 @@ SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include dire IF(WIN32) SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) ELSE(WIN32) - set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) @@ -34,10 +34,18 @@ ExternalProject_Add( GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON CMAKE_ARGS -DCMAKE_MACOSX_RPATH=ON + CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=Release ) LIST(APPEND external_project_dependencies zlib) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index b76852fc6c..7eb92efcb0 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -2,6 +2,7 @@ include(CheckCXXCompilerFlag) include(CheckCCompilerFlag) include(CheckCXXSymbolExists) +include(CheckTypeSize) function(CheckCompilerCXX11Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") @@ -25,7 +26,7 @@ function(CheckCompilerCXX11Flag) endfunction() CheckCompilerCXX11Flag() -LIST(APPEND CMAKE_CXX_FLAGS -std=c++11) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # safe_set_flag # @@ -83,6 +84,17 @@ if(NOT UINT64_MAX_EXISTS) endif() endif() +SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h") +CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND) +CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND) +if(SPINLOCK_FOUND) + add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK) +endif(SPINLOCK_FOUND) +if(BARRIER_FOUND) + add_definitions(-DPADDLE_USE_PTHREAD_BARRIER) +endif(BARRIER_FOUND) +SET(CMAKE_EXTRA_INCLUDE_FILES "") + # Common flags. the compiler flag used for C/C++ sources whenever release or debug # Do not care if this flag is support for gcc. set(COMMON_FLAGS diff --git a/cmake/simd.cmake b/cmake/simd.cmake index d380c996df..46035a908b 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -2,6 +2,7 @@ # so that PaddlePaddle can unleash the vectorization power of muticore. 
INCLUDE(CheckCXXSourceRuns) +INCLUDE(CheckCXXSourceCompiles) IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(MMX_FLAG "-mmmx") @@ -17,6 +18,8 @@ ELSEIF(MSVC) SET(AVX2_FLAG "/arch:AVX2") ENDIF() +set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) + # Check MMX set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) CHECK_CXX_SOURCE_RUNS(" @@ -73,4 +76,5 @@ int main() return 0; }" AVX2_FOUND) +set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND) diff --git a/cmake/system.cmake b/cmake/system.cmake index 3e472da7e0..3ca06665ab 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -67,6 +67,12 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}") MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") +IF(DEFINED CMAKE_SYSTEM_NAME) + IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android") + SET(ANDROID TRUE) + ENDIF() +ENDIF() + # external dependencies log output SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output diff --git a/cmake/util.cmake b/cmake/util.cmake index 3640e4651f..099a85809d 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -90,25 +90,9 @@ function(link_paddle_exe TARGET_NAME) ${RDMA_LD_FLAGS} ${RDMA_LIBS}) - if(WITH_PYTHON) - target_link_libraries(${TARGET_NAME} - ${PYTHON_LIBRARIES} util) - endif() - - if(WITH_GPU) - target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY}) - if(NOT WITH_DSO OR WITH_METRIC) - target_link_libraries(${TARGET_NAME} - ${CUDNN_LIBRARY} - ${CUDA_curand_LIBRARY}) - CUDA_ADD_CUBLAS_TO_TARGET(${TARGET_NAME}) - endif() - - check_library_exists(rt clock_gettime "time.h" HAVE_CLOCK_GETTIME ) - if(HAVE_CLOCK_GETTIME) - target_link_libraries(${TARGET_NAME} rt) - endif() - endif() + if(ANDROID) + target_link_libraries(${TARGET_NAME} log) + endif(ANDROID) add_dependencies(${TARGET_NAME} ${external_project_dependencies}) endfunction() diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py index 5d138a8c4f..3072c37512 100644 --- a/demo/seqToseq/api_train_v2.py +++ b/demo/seqToseq/api_train_v2.py @@ -1,13 +1,17 @@ import sys + import paddle.v2 as paddle -def seqToseq_net(source_dict_dim, target_dict_dim): +def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): ### Network Architecture word_vector_dim = 512 # dimension of word vector decoder_size = 512 # dimension of hidden unit in GRU Decoder network encoder_size = 512 # dimension of hidden unit in GRU Encoder network + beam_size = 3 + max_length = 250 + #### Encoder src_word_id = paddle.layer.data( name='source_language_word', @@ -67,79 +71,143 @@ def seqToseq_net(source_dict_dim, target_dict_dim): group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True) group_inputs = [group_input1, group_input2] - trg_embedding = paddle.layer.embedding( - input=paddle.layer.data( - name='target_language_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)), - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - # For decoder equipped with attention mechanism, in training, - # target embeding (the groudtruth) is the data input, - # while encoded source sequence is accessed to as an unbounded memory. - # Here, the StaticInput defines a read-only memory - # for the recurrent_group. 
- decoder = paddle.layer.recurrent_group( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) - - lbl = paddle.layer.data( - name='target_language_next_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)) - cost = paddle.layer.classification_cost(input=decoder, label=lbl) - - return cost + if not is_generating: + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + + # For a decoder equipped with an attention mechanism, in training, + # the target embedding (the ground truth) is the data input, + # while the encoded source sequence is accessed as an unbounded memory. + # Here, the StaticInput defines a read-only memory + # for the recurrent_group. + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) + + lbl = paddle.layer.data( + name='target_language_next_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)) + cost = paddle.layer.classification_cost(input=decoder, label=lbl) + + return cost + else: + # In generation, the decoder predicts the next target word based on + # the encoded source sequence and the last generated target word. + + # The encoded source sequence (encoder's output) must be specified by + # StaticInput, which is a read-only memory. + # The embedding of the last generated word is automatically retrieved by + # GeneratedInputs, which is initialized by a start mark, such as <s>, + # and must be included in generation. + + trg_embedding = paddle.layer.GeneratedInputV2( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) + group_inputs.append(trg_embedding) + + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, + eos_id=1, + beam_size=beam_size, + max_length=max_length) + + return beam_gen def main(): paddle.init(use_gpu=False, trainer_count=1) + is_generating = False # source and target dict dim.
dict_size = 30000 source_dict_dim = target_dict_dim = dict_size - # define network topology - cost = seqToseq_net(source_dict_dim, target_dict_dim) - parameters = paddle.parameters.create(cost) - - # define optimize method and trainer - optimizer = paddle.optimizer.Adam( - learning_rate=5e-5, - regularization=paddle.optimizer.L2Regularization(rate=1e-3)) - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=optimizer) - - # define data reader - feeding = { - 'source_language_word': 0, - 'target_language_word': 1, - 'target_language_next_word': 2 - } - - wmt14_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192), - batch_size=5) - - # define event_handler callback - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 10 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) + # train the network + if not is_generating: + cost = seqToseq_net(source_dict_dim, target_dict_dim) + parameters = paddle.parameters.create(cost) + + # define optimize method and trainer + optimizer = paddle.optimizer.Adam( + learning_rate=5e-5, + regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + # define data reader + wmt14_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=8192), + batch_size=5) + + # define event_handler callback + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 10 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, + event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() + + # start to train + trainer.train( + reader=wmt14_reader, event_handler=event_handler, num_passes=2) + + # generate an English sequence to French + else: + # use the first 3 samples for generation + gen_creator = paddle.dataset.wmt14.gen(dict_size) + gen_data = [] + gen_num = 3 + for item in gen_creator(): + gen_data.append((item[0], )) + if len(gen_data) == gen_num: + break + + beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating) + # get the pretrained model, whose BLEU = 26.92 + parameters = paddle.dataset.wmt14.model() + # prob holds the prediction probabilities, and id holds the predicted word ids.
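+        # The layout below is inferred from how the results are unpacked
+        # later in this script: beam_result[0] is a matrix of per-candidate
+        # probabilities indexed as prob[i][j] (the j-th beam candidate of the
+        # i-th input), and beam_result[1] is one flat id stream in which each
+        # candidate begins with its sequence length and ends with the -1
+        # delimiter.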
+        beam_result = paddle.infer( + output_layer=beam_gen, + parameters=parameters, + input=gen_data, + field=['prob', 'id']) + + # get the dictionary + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) + + # the delimiter element of generated sequences is -1, + # the first element of each generated sequence is the sequence length + seq_list = [] + seq = [] + for w in beam_result[1]: + if w != -1: + seq.append(w) else: - sys.stdout.write('.') - sys.stdout.flush() - - # start to train - trainer.train( - reader=wmt14_reader, - event_handler=event_handler, - num_passes=10000, - feeding=feeding) + seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) + seq = [] + + prob = beam_result[0] + beam_size = 3 + for i in xrange(gen_num): + print "\n*******************************************************\n" + print "src:", ' '.join( + [src_dict.get(w) for w in gen_data[i][0]]), "\n" + for j in xrange(beam_size): + print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] if __name__ == '__main__': diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index db33a20487..05817ec854 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -109,6 +109,12 @@ sum_to_one_norm :members: sum_to_one_norm :noindex: +cross_channel_norm +------------------ +.. automodule:: paddle.v2.layer + :members: cross_channel_norm + :noindex: + Recurrent Layers ================ diff --git a/doc/api/v2/config/optimizer.rst b/doc/api/v2/config/optimizer.rst index ec6ba0aa46..b32373fdef 100644 --- a/doc/api/v2/config/optimizer.rst +++ b/doc/api/v2/config/optimizer.rst @@ -1,5 +1,3 @@ -.. _api_v2.optimizer: - ========== Optimizer ========== diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst index b042320bc2..fef87c4fbd 100644 --- a/doc/api/v2/data.rst +++ b/doc/api/v2/data.rst @@ -1,6 +1,6 @@ -======== -Datasets -======== +================================== +Data Reader Interface and DataSets +================================== DataTypes @@ -49,7 +49,6 @@ mnist :members: :noindex: - cifar +++++ @@ -61,7 +60,7 @@ conll05 +++++++ .. automodule:: paddle.v2.dataset.conll05 - :members: + :members: get_dict,get_embedding,test :noindex: imdb @@ -85,6 +84,12 @@ movielens :members: :noindex: +.. autoclass:: paddle.v2.dataset.movielens.MovieInfo + :noindex: + +.. autoclass:: paddle.v2.dataset.movielens.UserInfo + :noindex: + sentiment +++++++++ @@ -102,7 +107,7 @@ uci_housing wmt14 +++++ -.. automodule:: paddle.v2.dataset.uci_housing +.. automodule:: paddle.v2.dataset.wmt14 :members: :noindex: diff --git a/doc/api/v2/run_logic.rst b/doc/api/v2/run_logic.rst index 94921e1a7b..5c97651f65 100644 --- a/doc/api/v2/run_logic.rst +++ b/doc/api/v2/run_logic.rst @@ -6,18 +6,21 @@ Parameters ========== .. automodule:: paddle.v2.parameters + :members: Parameters :noindex: Trainer ======= .. automodule:: paddle.v2.trainer + :members: SGD :noindex: Event ===== .. automodule:: paddle.v2.event + :members: :noindex: Inference =========  ..
autofunction:: paddle.v2.infer :noindex: + \ No newline at end of file diff --git a/doc/design/dist/README.md b/doc/design/dist/README.md new file mode 100644 index 0000000000..1788208bca --- /dev/null +++ b/doc/design/dist/README.md @@ -0,0 +1,172 @@ +# Design Doc: Distributed Training + +## Objective + +In [these slides](https://www.slideshare.net/cxwangyi/paddlepaddle-a-complete-solution-for-businesses), we explained that we'd like PaddlePaddle running on general-purpose clusters like those managed by Kubernetes, so as to address demands for AI from both Internet and non-Internet industries. + +This poses technical challenges to PaddlePaddle: + +1. Support fault-recovery. +1. Support both offline and online training. +1. [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) of distributed training. + + +## Training Job + +A training job will be created once a user asks Paddle cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes: + +1. the *master process*, which dispatches tasks to +1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via +1. one or more *parameter server processes*, where each holds a shard of the global model. + +Their relation is illustrated in the following graph: + + + +### Master Process + +The master process will: + +- Partition a dataset into [tasks](#task) and dispatch tasks to trainers. +- Keep track of training progress on the dataset with the [task queue](#task-queue). A training job will iterate over the dataset one full pass at a time before going into the next pass. + + +#### Task + +A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size. + +#### Task Queue + +The master process has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master process. Each master process has three task queues. + + + +- The todo queue holds tasks to be dispatched. When a job starts, the master process fills in the todo queue with all tasks. +- The pending queue holds tasks that are currently being trained by trainers. +- The done queue holds tasks that are already trained. + +The life cycle of a single task is illustrated below: + + + +1. When a new pass of training starts, all tasks will be placed in the todo queue. +1. The master process will dispatch a few tasks to each trainer at a time, put them in the pending queue, and wait for completion. +1. The trainer will work on its tasks and tell the master process once a task is completed. The master process will dispatch a new task to that trainer. +1. If a task times out, the master process will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded. +1. The master process will move completed tasks to the done queue. When the todo queue is empty, the master process will start a new pass by moving all tasks in the done queue to the todo queue and resetting the timeout counter of all tasks to zero. + +### Trainer Process + +The trainer process will: + +- Receive tasks from the master. +- Work on the tasks: calculate and upload gradients to parameter servers, and update the local model by downloading new parameters from parameter servers (a minimal sketch follows below).
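+To make the trainer's role concrete, the following is a minimal sketch of the trainer loop described above. The `master` and `ps` client objects and all of their method names are hypothetical illustrations, not an existing PaddlePaddle API:
+
+```python
+# Hypothetical sketch of the trainer loop; the master/ps client APIs are
+# illustrative only and do not exist in PaddlePaddle today.
+def trainer_loop(master, parameter_servers, model):
+    while True:
+        task = master.fetch_task()  # blocks until the master dispatches a task
+        for minibatch in task.minibatches():
+            # download the latest parameters, one shard per parameter server
+            for ps in parameter_servers:
+                model.set_shard(ps.index, ps.download_parameters())
+            grads = model.compute_gradients(minibatch)
+            # upload the matching gradient shard to each parameter server
+            for ps in parameter_servers:
+                ps.upload_gradient(grads.shard(ps.index))
+        master.report_done(task)  # the master moves the task to the done queue
+```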
+ +### Parameter Server Process + +Parameter server processes hold the parameters collaboratively. The parameters are partitioned on different parameter servers. + +The parameter server will: + +- Receive gradients from the trainers, update its parameters, and give the trainers the latest parameters. +- Periodically save its parameters to the distributed file system by overriding the previous save. + +### Optimization Algorithms + +The communication pattern between the trainers and the parameter servers depends on the category of optimization algorithm: + +- Synchronous Stochastic Gradient Descent (sync-SGD) + + The parameter server will wait for all trainers to finish the n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer will wait for the new parameters before starting the (n+1)-th mini-batch. + +- Asynchronous Stochastic Gradient Descent (async-SGD) + + There will be no synchronization between different trainers, and the parameter server updates its parameters as soon as it receives a new gradient: + + - Each trainer uploads its accumulated gradient every n mini-batches. + - Every m mini-batches, the trainer downloads new parameters from the parameter server. + - n and m do not have to be equal. + +## Fault Tolerance + +The training job will pause if the master process is dead, or if any of the parameter server processes is dead. They will be started by [Kubernetes](https://kubernetes.io/) and recover in a few minutes. Please refer to [fault recovery](#fault-recovery). + +The training job will continue to make progress if there is at least one training process running. The strategy depends on the type of optimization algorithm: + +- sync-SGD + + TODO + +- async-SGD + + Since async-SGD does not require synchronization between mini-batches, the system will by definition make progress if at least one trainer is running. + +## Fault Recovery + +PaddlePaddle uses [etcd](https://github.com/coreos/etcd) to keep track of the states of processes. Because etcd is a distributed reliable key-value store, the restarted process can recover its states from etcd. The model parameters are periodically saved into the distributed file system, so a restarted parameter server can recover its parameters from the saved file. + +Now we will introduce how each process recovers from a failure. The graph below shows how etcd is used: + + + +### Master Process + +When the master is started by Kubernetes, it executes the following steps at startup: + +1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations. +1. Recovers the task queues from etcd if they already exist; otherwise, the master will create them. +1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers. +1. Starts dispatching the tasks to the trainers, and updates the task queues using an etcd transaction to ensure the lock is held during the update. + +The master process will kill itself if its etcd lease expires. + +When the master process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in a few minutes. + +### Trainer Process + +When the trainer is started by Kubernetes, it executes the following steps at startup: + +1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count. +1. Generates a unique ID, and sets the key `/trainer/<unique ID>` with its contact address as the value.
The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline. +1. Waits for tasks from the master to start training. + +If the trainer's etcd lease expires, it will try to set the key `/trainer/<unique ID>` again so that the master process can discover the trainer again. + +### Parameter Server Process + +When the parameter server is started by Kubernetes, it executes the following steps at startup: + +1. Read the desired total number of parameter servers from the etcd key `/ps_desired` +1. Search through etcd keys `/ps/` (`/ps/0`, `/ps/1`, ...) to find the first non-existent key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name. + + The desired number of parameter servers is 3: + + + + The third parameter server joined: + + + +1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index). +1. Now the parameter server is ready for the trainers' requests. + +If the parameter server's etcd lease expires, the parameter server will kill itself. + + +## Dynamic Scaling + +### Trainer Scaling + +TODO + +### Parameter Server Scaling + +Not planned for v1. + +## Training Dataset Format + +TODO + +## User Interface + +TODO diff --git a/doc/design/dist/src/paddle-etcd.graffle b/doc/design/dist/src/paddle-etcd.graffle new file mode 100644 index 0000000000..56681ae5bb Binary files /dev/null and b/doc/design/dist/src/paddle-etcd.graffle differ diff --git a/doc/design/dist/src/paddle-etcd.png b/doc/design/dist/src/paddle-etcd.png new file mode 100644 index 0000000000..4f9c9762b3 Binary files /dev/null and b/doc/design/dist/src/paddle-etcd.png differ diff --git a/doc/design/dist/src/paddle-model-sharding.graffle b/doc/design/dist/src/paddle-model-sharding.graffle new file mode 100644 index 0000000000..fba30f0ca2 Binary files /dev/null and b/doc/design/dist/src/paddle-model-sharding.graffle differ diff --git a/doc/design/dist/src/paddle-model-sharding.png b/doc/design/dist/src/paddle-model-sharding.png new file mode 100644 index 0000000000..8c3f6724ef Binary files /dev/null and b/doc/design/dist/src/paddle-model-sharding.png differ diff --git a/doc/design/dist/src/paddle-ps-0.png b/doc/design/dist/src/paddle-ps-0.png new file mode 100644 index 0000000000..47ef32806f Binary files /dev/null and b/doc/design/dist/src/paddle-ps-0.png differ diff --git a/doc/design/dist/src/paddle-ps-1.png b/doc/design/dist/src/paddle-ps-1.png new file mode 100644 index 0000000000..f3125db730 Binary files /dev/null and b/doc/design/dist/src/paddle-ps-1.png differ diff --git a/doc/design/dist/src/paddle-ps.graffle b/doc/design/dist/src/paddle-ps.graffle new file mode 100644 index 0000000000..0e536ffdd9 Binary files /dev/null and b/doc/design/dist/src/paddle-ps.graffle differ diff --git a/doc/design/dist/src/paddle-task-queues.graffle b/doc/design/dist/src/paddle-task-queues.graffle new file mode 100644 index 0000000000..4263ed8bfd Binary files /dev/null and b/doc/design/dist/src/paddle-task-queues.graffle differ diff --git a/doc/design/dist/src/paddle-task-queues.png b/doc/design/dist/src/paddle-task-queues.png new file mode 100644 index 0000000000..5f98026679 Binary files /dev/null and b/doc/design/dist/src/paddle-task-queues.png differ diff --git a/doc/design/dist/src/paddle-task-states.graffle b/doc/design/dist/src/paddle-task-states.graffle new file mode 100644 index 0000000000..cf1a0b9246 Binary files
/dev/null and b/doc/design/dist/src/paddle-task-states.graffle differ diff --git a/doc/design/dist/src/paddle-task-states.png b/doc/design/dist/src/paddle-task-states.png new file mode 100644 index 0000000000..4ae43cb66c Binary files /dev/null and b/doc/design/dist/src/paddle-task-states.png differ diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md index d9d54bff30..69f4501f37 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ b/doc/getstarted/build_and_install/build_from_source_en.md @@ -51,7 +51,7 @@ PaddlePaddle supports some build options. WITH_TIMER: Compile PaddlePaddle with stats timer WITH_PROFILER: Compile PaddlePaddle with GPU profiler WITH_DOC: Compile PaddlePaddle with documentation -ON_COVERALLS: Compile PaddlePaddle with code coverage +WITH_COVERAGE: Compile PaddlePaddle with code coverage COVERALLS_UPLOAD: Package code coverage data to coveralls ON_TRAVIS: Exclude special unit test on Travis CI diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index af889ec9d1..22db1ef658 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -4,119 +4,139 @@ PaddlePaddle的Docker容器使用方式 PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。 -纯CPU和GPU的docker镜像使用说明 +PaddlePaddle发布的docker镜像使用说明 ------------------------------ -对于每一个PaddlePaddle版本,我们都会发布两个Docker镜像:纯CPU的和GPU的。 -我们通过设置 `dockerhub.com `_ 自动生成最新的docker镜像: -`paddledev/paddle:0.10.0rc1-cpu` 和 `paddledev/paddle:0.10.0rc1-gpu`。 +对于每一个PaddlePaddle版本,我们都会发布两种Docker镜像:开发镜像、运行镜像。运行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。 +我们会在 `dockerhub.com `_ 提供最新的docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。 +1. 开发镜像::code:`paddlepaddle/paddle:-dev` -以交互容器方式运行纯CPU的镜像: + 这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境,完成开发,编译,发布, + 文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具,所以如果需要自行配置开发环境需要考虑版本的因素。 + 开发镜像包含了以下工具: + - gcc/clang + - nvcc + - Python + - sphinx + - woboq + - sshd + 很多开发者会使用远程的安装有GPU的服务器工作,用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作, + 也可以在开发镜像中启动一个SSHD服务,方便开发者直接登录到镜像中进行开发: -.. code-block:: bash + 以交互容器方式运行开发镜像: - docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash + .. code-block:: bash -或者,可以以后台进程方式运行容器: + docker run -it --rm paddledev/paddle:-dev /bin/bash -.. code-block:: bash + 或者,可以以后台进程方式运行容器: - docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu + .. code-block:: bash -然后用密码 :code:`root` SSH进入容器: + docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:-dev -.. code-block:: bash + 然后用密码 :code:`root` SSH进入容器: - ssh -p 2202 root@localhost + .. code-block:: bash -SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。 + ssh -p 2202 root@localhost + SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。 -以上方法在GPU镜像里也能用-只是请不要忘记按装CUDA驱动,以及告诉Docker: +2. 运行镜像:根据CPU、GPU和非AVX区分了如下4个镜像: + - GPU/AVX::code:`paddlepaddle/paddle:-gpu` + - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` + - CPU/AVX::code:`paddlepaddle/paddle:` + - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` -..
code-block:: bash + 纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX: - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu + .. code-block:: bash + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi -运行PaddlePaddle书籍 ---------------------- + 如果输出是No,就需要选择使用no-AVX的镜像 -Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 + 以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 + 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 -PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nodebook。 -如果您想要更深入了解deep learning,PaddlePaddle书籍一定是您最好的选择。 + .. code-block:: bash -当您进入容器内之后,只用运行以下命令: + nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash -.. code-block:: bash - - jupyter notebook + 注意: 如果使用nvidia-docker存在问题,你也许可以尝试更老的方法,具体如下,但是我们并不推荐这种方法。: -然后在浏览器中输入以下网址: - -.. code-block:: text + .. code-block:: bash - http://localhost:8888/ + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:-gpu -就这么简单,享受您的旅程! +3. 使用运行镜像发布你的AI程序 + 假设您已经完成了一个AI训练的python程序 :code:`a.py`,这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行: + .. code-block:: bash -非AVX镜像 ---------- + docker run -it -v $PWD:/work paddle /work/a.py -纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX: + 这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像,可以编写`Dockerfile`使用`FROM paddledev/paddle:` + 创建和发布自己的AI程序镜像。 -.. code-block:: bash +运行PaddlePaddle书籍 +--------------------- - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi +Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 -如果输出是No,我们就需要手动编译一个非AVX版本的镜像: +PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nodebook。 +如果您想要更深入了解deep learning,PaddlePaddle书籍一定是您最好的选择。 + +我们提供可以直接运行PaddlePaddle书籍的docker镜像,直接运行: .. code-block:: bash - cd ~ - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . - docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . + docker run -p 8888:8888 paddlepaddle/book +然后在浏览器中输入以下网址: + +.. code-block:: text + + http://localhost:8888/ + +就这么简单,享受您的旅程! 通过Docker容器开发PaddlePaddle ------------------------------ -开发人员可以在Docker中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。 +开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。 + +1. 构建开发镜像 -1. 将开发环境构建为Docker镜像 - .. code-block:: bash git clone --recursive https://github.com/PaddlePaddle/Paddle cd Paddle - docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile . + docker build -t paddle:dev . - 请注意,默认情况下,:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做,需要设置一个参数: + 请注意,默认情况下,:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做,需要构建完开发镜像,然后执行: .. code-block:: bash - docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON . + docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev 2. 运行开发环境 当我们编译好了 :code:`paddle:dev`, 我们可以在docker容器里做开发,源代码可以通过挂载本地文件来被载入Docker的开发环境里面: - + .. 
code-block:: bash - docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev + docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev sshd 以上代码会启动一个带有PaddlePaddle开发环境的docker容器,源代码会被挂载到 :code:`/paddle` 。 - 请注意, :code:`paddle:dev` 的默认入口是 :code:`sshd` 。以上的 :code:`docker run` 命令其实会启动一个在2202端口监听的SSHD服务器。这样,我们就能SSH进入我们的开发容器了: - + 以上的 :code:`docker run` 命令其实会启动一个在2202端口监听的SSHD服务器。这样,我们就能SSH进入我们的开发容器了: + .. code-block:: bash ssh root@localhost -p 2202 @@ -124,13 +144,13 @@ PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nod 3. 在Docker开发环境中编译与安装PaddlPaddle代码 当在容器里面的时候,可以用脚本 :code:`paddle/scripts/docker/build.sh` 来编译、安装与测试PaddlePaddle: - + .. code-block:: bash - + /paddle/paddle/scripts/docker/build.sh 以上指令会在 :code:`/paddle/build` 中编译PaddlePaddle。通过以下指令可以运行单元测试: - + .. code-block:: bash cd /paddle/build @@ -140,14 +160,14 @@ PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nod 文档 ---- -Paddle的Docker镜像带有一个通过 `woboq code browser +Paddle的Docker开发镜像带有一个通过 `woboq code browser `_ 生成的HTML版本的C++源代码,便于用户浏览C++源码。 只要在Docker里启动PaddlePaddle的时候给它一个名字,就可以再运行另一个Nginx Docker镜像来服务HTML代码: .. code-block:: bash - docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu + docker run -d --name paddle-cpu-doc paddle:-dev docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx 接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 606746597a..8fb9369e0e 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -8,173 +8,255 @@ Please be aware that you will need to change `Dockers settings `_ to make full use of your hardware resource on Mac OS X and Windows. +Working With Docker +------------------- + +Docker is simple as long as we understand a few basic concepts: + +- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, PaddlePaddle's Docker image includes pre-built PaddlePaddle, Python, and many Python packages. We can run a Docker image directly, rather than installing all this software. We can type + + .. code-block:: bash + + docker images + + to list all images in the system. We can also run + + .. code-block:: bash + + docker pull paddlepaddle/paddle:0.10.0rc2 + + to download a Docker image, paddlepaddle/paddle in this example, + from Dockerhub.com. + +- *container*: considering a Docker image as a program, a container is a + "process" that runs the image. Indeed, a container is exactly an + operating system process, but with a virtualized filesystem, network + port space, and an otherwise virtualized environment. We can type + + .. code-block:: bash + + docker run paddlepaddle/paddle:0.10.0rc2 + + to start a container to run a Docker image, paddlepaddle/paddle in this example. + +- By default a docker container has an isolated file system namespace, + so we can not see the files in the host file system. By using a *volume*, + files mounted from the host will be visible inside the docker container. + The following command will mount the current directory into /data inside + the docker container, and run the container from the debian image with + the command :code:`ls /data`. + + .. code-block:: bash + + docker run --rm -v $(pwd):/data debian ls /data Usage of CPU-only and GPU Images ---------------------------------- -For each version of PaddlePaddle, we release 2 Docker images, a -CPU-only one and a CUDA GPU one.
We do so by configuring -`dockerhub.com `_ -automatically generate the latest docker images `paddledev/paddle:0.10.0rc1-cpu` -and `paddledev/paddle:0.10.0rc1-gpu`. +For each version of PaddlePaddle, we release two types of Docker images: +a development image and a production image. The production image includes +a CPU-only version, a CUDA GPU version, and their no-AVX versions. We +put the docker images on `dockerhub.com +`_. You can find the +latest versions under the "tags" tab at dockerhub.com. -To run the CPU-only image as an interactive container: +1. Production images. This image might have multiple variants: -.. code-block:: bash + - GPU/AVX::code:`paddlepaddle/paddle:-gpu` + - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` + - CPU/AVX::code:`paddlepaddle/paddle:` + - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash + Please be aware that the CPU-only and the GPU images both use the + AVX instruction set, but old computers produced before 2008 do not + support AVX. The following command checks if your Linux computer + supports AVX: -or, we can run it as a daemon container + .. code-block:: bash -.. code-block:: bash + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu + + To run the CPU-only image as an interactive container: -and SSH to this container using password :code:`root`: + .. code-block:: bash -.. code-block:: bash + docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash - ssh -p 2202 root@localhost + The above method works with the GPU image too -- the recommended way is + to use `nvidia-docker `_. -An advantage of using SSH is that we can connect to PaddlePaddle from -more than one terminals. For example, one terminal running vi and -another one running Python interpreter. Another advantage is that we -can run the PaddlePaddle container on a remote server and SSH to it -from a laptop. + Please install nvidia-docker first following this `tutorial + `_. -Above methods work with the GPU image too -- just please don't forget -to install CUDA driver and let Docker knows about it: + Now you can run a GPU image: -.. code-block:: bash + .. code-block:: bash - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu + nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash +2. Development image :code:`paddlepaddle/paddle:-dev` -PaddlePaddle Book ------------------ + This image packs the related development tools and runtime + environment. Users and developers can use this image instead of + their own local computer to accomplish development, building, + releasing, documentation writing, etc. Since different versions of Paddle + may depend on different versions of libraries and tools, if you + want to set up a local environment, you must pay attention to the + versions. The development image contains: + + - gcc/clang + - nvcc + - Python + - sphinx + - woboq + - sshd + + Many developers use servers with GPUs; they can use ssh to log in to + the server and run :code:`docker exec` to enter the docker + container and start their work. They can also start a development + docker image with an SSHD service, so they can log in to the container + and start working.
-The Jupyter Notebook is an open-source web application that allows
-you to create and share documents that contain live code, equations,
-visualizations and explanatory text in a single browser.
-PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
-We already exposed port 8888 for this book. If you want to
-dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
+Train Model Using Python API
+----------------------------

-Once you are inside the container, simply issue the command:
+Our official docker image provides a runtime for PaddlePaddle
+programs. The typical workflow is as follows:
+
+Create a directory as the workspace:

 .. code-block:: bash
-
-   jupyter notebook

-Then, you would back and paste the address into the local browser:
-
-.. code-block:: text
+   mkdir ~/workspace

-   http://localhost:8888/
+Edit a PaddlePaddle python program using your favourite editor:

-That's all. Enjoy your journey!
+.. code-block:: bash

+   emacs ~/workspace/example.py

-Non-AVX Images
---------------
+Run the program using docker:

-Please be aware that the CPU-only and the GPU images both use the AVX
-instruction set, but old computers produced before 2008 do not support
-AVX. The following command checks if your Linux computer supports
-AVX:
+.. code-block:: bash
+
+   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py
+
+Or, if you are using a GPU for training:

 .. code-block:: bash

-   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py

+The above commands start a docker container by running :code:`python
+/workspace/example.py`. The container stops once :code:`python
+/workspace/example.py` finishes.

-If it doesn't, we will need to build non-AVX images manually from
-source code:
+Another way is to tell docker to start a :code:`/bin/bash` session and
+run the PaddlePaddle program interactively:

 .. code-block:: bash

-   cd ~
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
-   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash
+   # now we are inside docker container
+   cd /workspace
+   python example.py

+Running with a GPU is identical:

-Development Using Docker
-------------------------
+.. code-block:: bash

-Developers can work on PaddlePaddle using Docker. This allows
-developers to work on different platforms -- Linux, Mac OS X, and
-Windows -- in a consistent way.
+   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
+   # now we are inside docker container
+   cd /workspace
+   python example.py

-1. Build the Development Environment as a Docker Image
+Develop PaddlePaddle or Train Model Using C++ API
+---------------------------------------------------

-   .. code-block:: bash
+We will be using the PaddlePaddle development image, since it contains
+all the compiling tools and dependencies.

-      git clone --recursive https://github.com/PaddlePaddle/Paddle
-      cd Paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+Let's clone the PaddlePaddle repo first:

-   Note that by default :code:`docker build` wouldn't import source
-   tree into the image and build it. If we want to do that, we need
-   to set a build arg:
+..
code-block:: bash

+   git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle

-   .. code-block:: bash
+Mount both the workspace folder and the paddle code folder into the
+docker container, so that we can access them inside the container.
+There are two ways of using the PaddlePaddle development docker image:

-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
+- run an interactive bash directly

+  .. code-block:: bash

-2. Run the Development Environment
+     # use nvidia-docker instead of docker if you need to use GPU
+     docker run -it -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /bin/bash
+     # now we are inside docker container

-   Once we got the image :code:`paddle:dev`, we can use it to develop
-   Paddle by mounting the local source code tree into a container that
-   runs the image:
+- or, we can run it as a daemon container

-   .. code-block:: bash
+  .. code-block:: bash

-      docker run -d -p 2202:22 -p 8888:8888 -v $PWD:/paddle paddle:dev
+     # use nvidia-docker instead of docker if you need to use GPU
+     docker run -d -p 2202:22 -p 8888:8888 -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /usr/sbin/sshd -D

-   This runs a container of the development environment Docker image
-   with the local source tree mounted to :code:`/paddle` of the
-   container.
+  and SSH to this container using password :code:`root`:

-   Note that the default entry-point of :code:`paddle:dev` is
-   :code:`sshd`, and above :code:`docker run` commands actually starts
-   an SSHD server listening on port 2202. This allows us to log into
-   this container with:
+  .. code-block:: bash

-   .. code-block:: bash
+     ssh -p 2202 root@localhost

-      ssh root@localhost -p 2202
+  An advantage is that we can run the PaddlePaddle container on a
+  remote server and SSH to it from a laptop.

-   Usually, I run above commands on my Mac. I can also run them on a
-   GPU server :code:`xxx.yyy.zzz.www` and ssh from my Mac to it:
+When developing PaddlePaddle, you can edit the PaddlePaddle source code
+from outside of the docker container using your favorite editor. To
+compile PaddlePaddle, run inside the container:

-   .. code-block:: bash
+.. code-block:: bash

-      my-mac$ ssh root@xxx.yyy.zzz.www -p 2202
+   WITH_GPU=OFF WITH_AVX=ON WITH_TEST=ON bash /paddle/paddle/scripts/docker/build.sh

-3. Build and Install Using the Development Environment
+This builds everything about Paddle in :code:`/paddle/build`, and we
+can run unit tests there:

-   Once I am in the container, I can use
-   :code:`paddle/scripts/docker/build.sh` to build, install, and test
-   Paddle:
+.. code-block:: bash

-   .. code-block:: bash
+   cd /paddle/build
+   ctest

-      /paddle/paddle/scripts/docker/build.sh
+When training a model using the C++ API, we can edit the paddle program
+in :code:`~/workspace` outside of docker, and build it from
+:code:`/workspace` inside of docker.

-   This builds everything about Paddle in :code:`/paddle/build`. And
-   we can run unit tests there:
+PaddlePaddle Book
+------------------

-   .. code-block:: bash
+The Jupyter Notebook is an open-source web application that allows
+you to create and share documents that contain live code, equations,
+visualizations and explanatory text in a single browser.

-      cd /paddle/build
-      ctest
+PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
+We already exposed port 8888 for this book. If you want to
+dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.

+We provide a packaged book image; simply issue the command:
+
+..
code-block:: bash

-   cd /paddle/build
-   ctest
+   docker run -p 8888:8888 paddlepaddle/book
+
+Then, copy and paste the address into your local browser:
+
+.. code-block:: text
+
+   http://localhost:8888/
+
+That's all. Enjoy your journey!

Documentation
@@ -191,7 +273,7 @@ container:

 .. code-block:: bash

-   docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
+   docker run -d --name paddle-cpu-doc paddle:<version>
    docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx

diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
index d02d9c63bb..9e39ccb00f 100644
--- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
@@ -46,7 +46,6 @@ PaddlePaddle提供了ubuntu 14.04 deb安装包。
     with_double: OFF
     with_python: ON
     with_rdma: OFF
-    with_metric_learning:
     with_timer: OFF
     with_predict_sdk:
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 6dc48704bc..95cad835b1 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -55,6 +55,7 @@ extensions = [
     'sphinx.ext.napoleon',
     'sphinx.ext.graphviz'
 ]
+mathjax_path="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js"
 table_styling_embed_css = True
 autodoc_member_order = 'bysource'
diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/tutorials/embedding_model/index_cn.md
index fe800308d8..2b4a79fbbf 100644
--- a/doc/tutorials/embedding_model/index_cn.md
+++ b/doc/tutorials/embedding_model/index_cn.md
@@ -6,9 +6,10 @@
 ## 介绍 ###
 ### 中文字典 ###
-我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下: "《红楼梦》"将被分为 "《","红楼梦","》",和 "《红楼梦》"。字典采用UTF8编码,输出有2列:词本身和词频。字典共包含 3206325个词和3个特殊标记:
+我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下: "《红楼梦》"将被分为 "《","红楼梦","》",和 "《红楼梦》"。字典采用UTF8编码,输出有2列:词本身和词频。字典共包含 3206326个词和4个特殊标记:
  - `<s>`: 分词序列的开始
  - `<e>`: 分词序列的结束
+ - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: 占位符,没有实际意义
  - `<unk>`: 未知词

### 中文词向量的预训练模型 ###
diff --git a/doc/tutorials/embedding_model/index_en.md b/doc/tutorials/embedding_model/index_en.md
index d793a50f48..9525f64f9b 100644
--- a/doc/tutorials/embedding_model/index_en.md
+++ b/doc/tutorials/embedding_model/index_en.md
@@ -6,9 +6,10 @@ We thank @lipeng for the pull request that defined the model schemas and pretrai
 ## Introduction ###
 ### Chinese Word Dictionary ###
-Our Chinese-word dictionary is created on Baidu ZhiDao and Baidu Baike by using in-house word segmentor. For example, the participle of "《红楼梦》" is "《","红楼梦","》",and "《红楼梦》". Our dictionary (using UTF-8 format) has has two columns: word and its frequency. The total word count is 3206325, including 3 special token:
+Our Chinese-word dictionary is created on Baidu ZhiDao and Baidu Baike by using an in-house word segmentor. For example, the participle of "《红楼梦》" is "《","红楼梦","》",and "《红楼梦》". Our dictionary (using UTF-8 format) has two columns: word and its frequency.
The total word count is 3206326, including 4 special tokens:
 - `<s>`: the start of a sequence
 - `<e>`: the end of a sequence
+ - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: a placeholder, just ignore it and its embedding
 - `<unk>`: a word not included in dictionary

### Pretrained Chinese Word Embedding Model ###
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 503024cff3..9d6d67e62c 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -9,13 +9,8 @@ add_subdirectory(pserver)
add_subdirectory(trainer)
add_subdirectory(scripts)

-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
-    ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
-
-if(WITH_PREDICT_SDK)
-  add_subdirectory(predict)
-endif()
-
if(WITH_SWIG_PY)
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
+    ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
  add_subdirectory(api)
endif()
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index 6e8fcd114d..4d0dacae90 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -1,21 +1,3 @@
-FUNCTION(generate_python_api target_name)
-  ADD_CUSTOM_COMMAND(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-                            ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                            ${PROJ_ROOT}/paddle/Paddle_wrap.h
-    COMMAND ${SWIG_EXECUTABLE} -python -c++ -outcurrentdir -I../ api/Paddle.swig
-         && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-    DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
-            ${PROJ_ROOT}/paddle/api/PaddleAPI.h
-            ${external_project_dependencies}
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-    COMMENT "Generate Python API from swig")
-  ADD_CUSTOM_TARGET(${target_name} ALL DEPENDS
-    ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-    ${PROJ_ROOT}/paddle/Paddle_wrap.h
-    ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-    ${external_project_dependencies})
-ENDFUNCTION(generate_python_api)
-
set(API_SOURCES
    Arguments.cpp
    ConfigParser.cpp
@@ -33,65 +15,84 @@ set(API_HEADER
    PaddleAPI.h
    Internal.h)

-add_library(paddle_api STATIC
-    ${API_SOURCES})
+add_library(paddle_api STATIC ${API_SOURCES})
add_dependencies(paddle_api gen_proto_cpp)

-list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
+INCLUDE(${SWIG_USE_FILE})
+INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)

-if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
-# Because gflags compiled by cmake, so it is imported by cmake target,
-# not a real library path. Get the real library path here.
-message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}") -get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION) -message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}") -else() -set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES}) -endif() +FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) + +SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) + +SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) +SET(CMAKE_CXX_FLAGS "-std=c++11 -fPIC -Wall") +IF(WITH_COVERAGE) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") +ENDIF(WITH_COVERAGE) -configure_file( - paddle_api_config.py.in - ${PROJ_ROOT}/paddle/api/paddle_api_config.py +SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS + paddle_parameter + paddle_function + paddle_math + paddle_utils + paddle_gserver + paddle_pserver + paddle_api + paddle_cuda + paddle_trainer_lib + paddle_network + paddle_proto + ${external_project_dependencies} ) -generate_python_api(python_swig_sources) +IF(APPLE) + SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load") +ELSE(APPLE) + SET(START_GROUP "-Xlinker -start-group") + SET(END_GROUP "-Xlinker -end-group") + SET(ARCHIVE_START "-Wl,--whole-archive") + SET(ARCHIVE_END "-Wl,--no-whole-archive") +ENDIF(APPLE) -file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) +SWIG_ADD_MODULE(swig_paddle python Paddle.i) +SWIG_LINK_LIBRARIES(swig_paddle + ${MACOS_LD_FLAGS} + ${START_GROUP} + ${ARCHIVE_START} + paddle_gserver + paddle_function + ${METRIC_LIBS} + ${ARCHIVE_END} + paddle_pserver + paddle_trainer_lib + paddle_network + paddle_parameter + paddle_math + paddle_utils + paddle_proto + paddle_cuda + paddle_api + ${CMAKE_DL_LIBS} + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${START_END} +) -# TODO(yuyang18) : make wheel name calculated by cmake -add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp +add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp COMMAND rm -rf py_paddle.egg-info build WORKING_DIRECTORY ${PROJ_ROOT}/paddle - DEPENDS python_swig_sources - paddle_parameter - paddle_function - paddle_math - paddle_utils - paddle_gserver - paddle_pserver - paddle_trainer - paddle_api - paddle_cuda - ${PY_PADDLE_PYTHON_FILES} + DEPENDS _swig_paddle ) -install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ - DESTINATION opt/paddle/share/wheels -) +# TODO(yuyang18) : make wheel name calculated by cmake +add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so) -add_custom_target(python_api_wheel ALL DEPENDS - ${PROJ_ROOT}/paddle/dist/.timestamp) -add_dependencies(python_api_wheel python_swig_sources - paddle_parameter - paddle_math - paddle_utils - paddle_gserver - paddle_pserver - paddle_trainer - paddle_api - paddle_cuda) +install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ DESTINATION opt/paddle/share/wheels) if(WITH_TESTING) IF(NOT PY_PIP_FOUND) diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.i similarity index 100% rename from paddle/api/Paddle.swig rename to paddle/api/Paddle.i diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in deleted file mode 100644 index 82f45ba6cc..0000000000 --- a/paddle/api/paddle_api_config.py.in +++ /dev/null @@ -1,17 +0,0 @@ 
-PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../" -WITH_GPU="@WITH_GPU@" -PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@" -ZLIB_LIBRARIES="@ZLIB_LIBRARIES@" -CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@" -CMAKE_DL_LIBS="@CMAKE_DL_LIBS@" - - -WITH_PYTHON="@WITH_PYTHON@" -PYTHON_LIBRARIES="@PYTHON_LIBRARIES@" -GLOG_LIBRARIES="@GLOG_LIBRARIES@" -GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@" -GFLAGS_LOCATION="@GFLAGS_LOCATION@" -CBLAS_LIBRARIES="@CBLAS_LIBRARIES@" - -CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@" -WITH_COVERALLS="@ON_COVERALLS@" diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py deleted file mode 100644 index ad5dce209b..0000000000 --- a/paddle/api/paddle_ld_flags.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - from paddle_api_config import * - import os.path - import platform - - system = platform.system().lower() - is_osx = (system == 'darwin') - is_win = (system == 'windows') - is_lin = (system == 'linux') - - if is_lin: - whole_start = "-Wl,--whole-archive" - whole_end = "-Wl,--no-whole-archive" - elif is_osx: - whole_start = "" - whole_end = "" - - LIB_DIRS = [ - "math", 'function', 'utils', 'parameter', "gserver", "api", "cuda", - "pserver", "trainer" - ] - PARENT_LIB_DIRS = ['proto'] - - class PaddleLDFlag(object): - def __init__(self): - self.paddle_build_dir = PADDLE_BUILD_DIR - self.paddle_build_dir = os.path.abspath(self.paddle_build_dir) - self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU) - self.protolib = PROTOBUF_LIBRARY - self.zlib = ZLIB_LIBRARIES - self.thread = CMAKE_THREAD_LIB - self.dl_libs = CMAKE_DL_LIBS - self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON) - self.python_libs = PYTHON_LIBRARIES - - self.glog_libs = GLOG_LIBRARIES - - self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS) - self.gflags_libs = GFLAGS_LIBRARIES - self.gflags_location = GFLAGS_LOCATION - self.cblas_libs = CBLAS_LIBRARIES - self.curt = CUDA_LIBRARIES - - def ldflag_str(self): - return " ".join( - [self.libs_dir_str(), self.parent_dir_str(), self.libs_str()]) - - def libs_dir_str(self): - libdirs = LIB_DIRS - return " ".join( - map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x), - libdirs)) - - def parent_dir_str(self): - libdirs = PARENT_LIB_DIRS - return " ".join( - map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x), - libdirs)) - - def libs_str(self): - libs = [ - whole_start, - "-lpaddle_gserver", - "-lpaddle_function", - whole_end, - "-lpaddle_pserver", - "-lpaddle_trainer_lib", - "-lpaddle_network", - '-lpaddle_parameter', - "-lpaddle_math", - '-lpaddle_utils', - "-lpaddle_proto", - "-lpaddle_cuda", - "-lpaddle_api", - self.normalize_flag(self.protolib), - self.normalize_flag(self.glog_libs), - self.normalize_flag(self.gflags_libs), - self.normalize_flag(self.zlib), - self.normalize_flag(self.thread), - self.normalize_flag(self.dl_libs), - self.normalize_flag(self.cblas_libs), - ] - - if self.with_python: - 
libs.append(self.normalize_flag(self.python_libs)) - if self.with_gpu: - libs.append(self.normalize_flag(self.curt)) - if self.with_coverage: - libs.append("-fprofile-arcs") - return " ".join(filter(lambda l: len(l) != 0, libs)) - - def normalize_flag(self, cmake_flag): - """ - CMake flag string to ld flag - :type cmake_flag: str - """ - if ";" in cmake_flag: - return " ".join(map(self.normalize_flag, cmake_flag.split(";"))) - if cmake_flag.startswith("/"): # is a path - return cmake_flag - elif cmake_flag.startswith("-l"): # normal link command - return cmake_flag - elif cmake_flag in [ - "gflags-shared", "gflags-static", "gflags_nothreads-shared", - "gflags_nothreads-static" - ]: # special for gflags - assert PaddleLDFlag.cmake_bool(self.gflags_location) - return self.gflags_location - elif len(cmake_flag) != 0: - return "".join(["-l", cmake_flag]) - else: - return "" - - @staticmethod - def cmake_bool(cmake_str): - """ - CMake bool string to bool - :param cmake_str: cmake boolean string - :type cmake_str: str - :rtype: bool - """ - if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith( - "-NOTFOUND"): - return False - else: - return True - - def c_flag(self): - if self.with_coverage: - return [ - "-fprofile-arcs", "-ftest-coverage", "-O0", "-g", - "-std=c++11" - ] - else: - return ["-std=c++11"] -except ImportError: - - class PaddleLDFlag(object): - def ldflag_str(self): - pass - - def c_flag(self): - pass diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh index f35bfbc5c8..9c49a4bd20 100644 --- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh +++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh @@ -17,7 +17,11 @@ limitations under the License. */ #include #include "hl_base.h" +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include "hl_neon_matrix_kernel.cuh" +#else #include "hl_sse_matrix_kernel.cuh" +#endif /** * @brief cpu element wise unary operator. diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/cuda/include/hl_matrix_base.cuh index db35ee2037..8b755c1095 100644 --- a/paddle/cuda/include/hl_matrix_base.cuh +++ b/paddle/cuda/include/hl_matrix_base.cuh @@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff; typedef BaseOp SSEFirst; typedef BaseOp SSESecond; typedef BaseOp SSEClassificationError; +#elif defined(__ARM__NEON__) || defined(__ARM_NEON) +#include "hl_matrix_base_neon.cuh" #else #include "hl_matrix_base_sse.cuh" #endif diff --git a/paddle/cuda/include/hl_matrix_base_neon.cuh b/paddle/cuda/include/hl_matrix_base_neon.cuh new file mode 100644 index 0000000000..e13019f5ee --- /dev/null +++ b/paddle/cuda/include/hl_matrix_base_neon.cuh @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + + +#ifndef HL_MATRIX_BASE_NEON_CUH_ +#define HL_MATRIX_BASE_NEON_CUH_ + +namespace aggregate { +class SSESum { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vaddq_f32(a, b); + } +}; + +class SSEMax { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vmaxq_f32(a, b); + } +}; + +class SSEMin { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vminq_f32(a, b); + } +}; +} // namespace aggregate + +namespace base { +namespace unary { +class SSEIdentity { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a) const { + return a; + } +}; +} // namespace unary + +namespace binary { +class SSEAdd { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vaddq_f32(a, b); + } +}; + +class SSEAdd2 { +public: + static const bool sse = true; + const real p1; + const real p2; + float32x4_t mp1; + float32x4_t mp2; + +public: + SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) { + mp1 = vdupq_n_f32(p1); + mp2 = vdupq_n_f32(p2); + } + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp1, tmp2; + tmp1 = vmulq_f32(mp1, a); + tmp2 = vmulq_f32(mp2, b); + return vaddq_f32(tmp1, tmp2); + } +}; + +class SSESub { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vsubq_f32(a, b); + } +}; + +class SSEMul { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return vmulq_f32(a, b); + } +}; + +class SSEDiv { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp; + tmp = vrecpeq_f32(b); + return vmulq_f32(a, tmp); + } +}; + +class SSESquaredDiff { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + float32x4_t tmp; + tmp = vsubq_f32(a, b); + return vmulq_f32(tmp, tmp); + } +}; + +class SSEFirst { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return a; + } +}; + +class SSESecond { +public: + static const bool sse = true; + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + return b; + } +}; + +class SSEClassificationError { +public: + static const bool sse = true; + const real p; + float32x4_t mp; + uint32x4_t result; + +public: + explicit SSEClassificationError(const real s) : p(s) { + mp = vdupq_n_f32(p); + result = vdupq_n_u32(1); + } + // TODO: to be check + INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const { + uint32x4_t tmp1 = vcgtq_f32(a, mp); + uint32x4_t tmp2 = vcgtq_f32(b, mp); + uint32x4_t tmp3 = veorq_u32(tmp1, tmp2); + return vcvtq_f32_u32(vandq_u32(tmp3, result)); + } +}; +} // namespace binary +} // namespace base + +#endif /* HL_MATRIX_BASE_NEON_CUH_ */ diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 59213eee75..f965ba9667 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -17,13 +17,20 @@ limitations under the License. 
*/

#include "hl_base.h"

-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__)
#include <vector_types.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef float4 vecType;
#else
typedef double2 vecType;
#endif
+#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
+#include <arm_neon.h>
+#ifndef PADDLE_TYPE_DOUBLE
+typedef float32x4_t vecType;
+#else
+#error NEON instructions do not support double precision
+#endif
#else
#include
#include
diff --git a/paddle/cuda/include/hl_neon_matrix_kernel.cuh b/paddle/cuda/include/hl_neon_matrix_kernel.cuh
new file mode 100644
index 0000000000..7b4e5b0007
--- /dev/null
+++ b/paddle/cuda/include/hl_neon_matrix_kernel.cuh
@@ -0,0 +1,299 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+#ifndef HL_NEON_MATRIX_KERNEL_CUH_
+#define HL_NEON_MATRIX_KERNEL_CUH_
+
+#include "hl_matrix_type.cuh"
+
+#define VECTOR_SIZE 16
+
+/* number of float in vector */
+#define VECTOR_LEN 4
+#define VECTOR_SET vdupq_n_f32
+
+inline bool hl_check_align(size_t size) {
+  return !(size & (VECTOR_SIZE - 1));
+}
+
+inline bool hl_check_align(void *ptr) {
+  return hl_check_align(reinterpret_cast<size_t>(ptr));
+}
+
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  float32x4_t rev = vrev64q_f32(mm);
+  float32x4_t tmp1 = agg.vecOp(rev, rev);
+  float32x2_t lo = vget_high_f32(rev);
+  float32x2_t hi = vget_low_f32(rev);
+  float32x4_t tmp2 = vcombine_f32(hi, lo);
+  float32x4_t ret = agg.vecOp(tmp1, tmp2);
+
+  return vgetq_lane_f32(ret, 0);
+}
+
+template <class Agg, class Op, class Saver>
+void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
+                          int dimM, int dimN,
+                          real *dst, int ld,
+                          real *A, int lda) {
+  for (int i = 0; i < dimM; i++, A += lda) {
+    vecType mm = VECTOR_SET(agg.init());
+    vecType *a = (vecType*)(A);
+    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
+      mm = agg.vecOp(mm, op.vecOp(*a));
+    }
+
+    int rem = dimN % VECTOR_LEN;
+    if (rem) {
+      real tmp = hl_agg_op(agg, mm);
+      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
+      for (int j = 0; j < rem; j++) {
+        tmp = agg(tmp, op(a[j]));
+      }
+      dst[i*ld] = sv(dst[i*ld], tmp);
+    } else {
+      dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
+    }
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
+                          int dimM, int dimN,
+                          real *dst, int ld,
+                          real *A, int lda,
+                          real *B, int ldb) {
+  for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
+    vecType mm = VECTOR_SET(agg.init());
+    vecType *a = (vecType*)(A);
+    vecType *b = (vecType*)(B);
+    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
+      mm = agg.vecOp(mm, op.vecOp(*a, *b));
+    }
+
+    int rem = dimN % VECTOR_LEN;
+    if (rem) {
+      real tmp = hl_agg_op(agg, mm);
+      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
+      real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
+      for (int j = 0; j < rem; j++) {
+        tmp = agg(tmp, op(a[j], b[j]));
+      }
+      dst[i*ld] = sv(dst[i*ld], tmp);
+    } else {
+      dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
+    }
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
+                         int dimM, int dimN,
+                         real *dst,
+                         real *A, int lda) {
+  for (int j = 0; j < dimN; j++) {
+    real tmp =
agg.init();
+    for (int i = 0; i < dimM; i++) {
+      tmp = agg(tmp, op(A[i * lda + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
+                         int dimM, int dimN,
+                         real *dst,
+                         real *A, int lda,
+                         real *B, int ldb) {
+  for (int j = 0; j < dimN; j++) {
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+      tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
+}
+
+/*
+ * MaxRow greater than or equal dimN
+ * dimN is multiples of VECTOR_LEN
+ * so rem <= MaxRow / VECTOR_LEN
+ */
+template <int MaxRow, class Agg, class Op, class Saver>
+void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
+                               int dimM, int dimN,
+                               real *dst,
+                               real *A, int lda) {
+  vecType mm[MaxRow / VECTOR_LEN];
+  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
+    mm[n] = VECTOR_SET(agg.init());
+  }
+
+  for (int i = 0; i < dimM; i++) {
+    vecType *a = (vecType*)(A + i * lda);
+    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
+      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
+    }
+  }
+
+  vecType *result = (vecType*)(dst);
+  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
+    result[n] = sv.vecOp(result[n], mm[n]);
+  }
+
+  int rem = dimN % VECTOR_LEN;
+  if (rem) {
+    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
+    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
+    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
+  }
+}
+
+/*
+ * dimN is multiples of VECTOR_LEN
+ * dimN greater than Step
+ */
+template <int Step, class Agg, class Op, class Saver>
+void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
+                             int dimM, int dimN,
+                             real *dst,
+                             real *A, int lda) {
+  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
+    vecType mm[Step / VECTOR_LEN];
+    for (int n = 0; n < Step / VECTOR_LEN; n++) {
+      mm[n] = VECTOR_SET(agg.init());
+    }
+
+    for (int i = 0; i < dimM; i++) {
+      vecType *a = (vecType*)(A + i * lda);
+      for (int n = 0; n < Step / VECTOR_LEN; n++) {
+        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
+      }
+    }
+
+    vecType *result = (vecType*)(dst);
+    for (int n = 0; n < Step / VECTOR_LEN; n++) {
+      result[n] = sv.vecOp(result[n], mm[n]);
+    }
+  }
+
+  int remRow = dimN % Step;
+  if (remRow) {
+    hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
+                             int dimM, int dimN,
+                             real *dst,
+                             real *A, int lda) {
+  if (dimN <= 16) {
+    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
+  } else if (dimN <= 32) {
+    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
+  } else if (dimN <= 1024 || dimM <= 512) {
+    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
+  } else {
+    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
+  }
+}
+
+template <int MaxRow, class Agg, class Op, class Saver>
+void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
+                               int dimM, int dimN,
+                               real *dst,
+                               real *A, int lda,
+                               real *B, int ldb) {
+  vecType mm[MaxRow / VECTOR_LEN];
+  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
+    mm[n] = VECTOR_SET(agg.init());
+  }
+
+  for (int i = 0; i < dimM; i++) {
+    vecType *a = (vecType*)(A + i * lda);
+    vecType *b = (vecType*)(B + i * ldb);
+    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
+      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
+    }
+  }
+
+  vecType *result = (vecType*)(dst);
+  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
+    result[n] = sv.vecOp(result[n], mm[n]);
+  }
+
+  int rem = dimN % VECTOR_LEN;
+  if (rem) {
+    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
+    B += (dimN / VECTOR_LEN) * VECTOR_LEN;
+    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
+    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
+  }
+}
+
+template <int Step, class Agg, class Op, class Saver>
+void
hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
+                             int dimM, int dimN,
+                             real *dst,
+                             real *A, int lda,
+                             real *B, int ldb) {
+  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
+    vecType mm[Step / VECTOR_LEN];
+    for (int n = 0; n < Step / VECTOR_LEN; n++) {
+      mm[n] = VECTOR_SET(agg.init());
+    }
+
+    for (int i = 0; i < dimM; i++) {
+      vecType *a = (vecType*)(A + i * lda);
+      vecType *b = (vecType*)(B + i * ldb);
+      for (int n = 0; n < Step / VECTOR_LEN; n++) {
+        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
+      }
+    }
+
+    vecType *result = (vecType*)(dst);
+    for (int n = 0; n < Step / VECTOR_LEN; n++) {
+      result[n] = sv.vecOp(result[n], mm[n]);
+    }
+  }
+
+  int remRow = dimN % Step;
+  if (remRow) {
+    hl_sse_column_op_with_rem<Step>(
+        agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
+                             int dimM, int dimN,
+                             real *dst,
+                             real *A, int lda,
+                             real *B, int ldb) {
+  if (dimN <= 16) {
+    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
+  } else if (dimN <= 32) {
+    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
+  } else if (dimN <= 1024 || dimM <= 512) {
+    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
+  } else {
+    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
+  }
+}
+
+#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 9f9d8f972e..973ddcceed 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -159,4 +159,10 @@ extern void hl_sequence_avg_forward(real* dst,
                                    int width,
                                    const int mode);

+extern void hl_sequence_avg_backward(real* dst,
+                                     real* src,
+                                     const int* starts,
+                                     int height,
+                                     int width,
+                                     const int mode);
#endif /* HL_SEQUENCE_H_ */
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index 05e51bce9e..920b417b1c 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -57,4 +57,10 @@ inline void hl_sequence_avg_forward(real* dst,
                                    int width,
                                    const int mode) {}

+inline void hl_sequence_avg_backward(real* dst,
+                                     real* src,
+                                     const int* starts,
+                                     int height,
+                                     int width,
+                                     const int mode) {}
#endif  // HL_SEQUENCE_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index ba823de272..0fe2877f89 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -325,12 +325,12 @@ __global__ void KeSequenceAvgForward(real* dst,
    int seqLength = end - start;
    if (seqLength == 0) return;
    real sum = 0.0;
-    for (int i = 0; i < seqLength; i++) {
-      sum += src[(start + i) * width + col];
+    for (int i = start; i < end; i++) {
+      sum += src[i * width + col];
    }
    sum = mode == 1 ? sum : (mode == 0 ?
sum / seqLength : sum * my_rsqrt((real)seqLength)); - dst[row * width + col] = sum; + dst[gid] = sum; } } @@ -354,3 +354,48 @@ void hl_sequence_avg_forward(real* dst, (dst, src, starts, height, width, mode); CHECK_SYNC("hl_sequence_avg_forward failed"); } + +__global__ void KeSequenceAvgBackward(real* dst, + real* src, + const int* starts, + int height, + int width, + const int mode) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int row = gid / width; + int col = gid % width; + + if (gid < height * width) { + int start = starts[row]; + int end = starts[row + 1]; + int seqLength = end - start; + if (seqLength == 0) return; + real grad = src[gid]; + grad = mode == 1 ? grad : + (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength)); + for (int i = start; i < end; i++) { + dst[i * width + col] += grad; + } + } +} + +void hl_sequence_avg_backward(real* dst, + real* src, + const int* starts, + int height, + int width, + const int mode) { + CHECK_NOTNULL(dst); + CHECK_NOTNULL(src); + CHECK_NOTNULL(starts); + + int block = 512; + int grid = DIVUP(width * height, 512); + + CHECK(mode == 0 || mode == 1 || mode == 2) + << "mode error in hl_sequence_avg_backward!"; + + KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>> + (dst, src, starts, height, width, mode); + CHECK_SYNC("hl_sequence_avg_backward failed"); +} diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp index f47d55a4ad..f71c0f681b 100644 --- a/paddle/function/Function.cpp +++ b/paddle/function/Function.cpp @@ -16,66 +16,6 @@ limitations under the License. */ namespace paddle { -template <> -size_t FuncConfig::get(const std::string& key) const { - auto it = valueMap_.find(key); - CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'"; - return it->second.s; -} - -template <> -real FuncConfig::get(const std::string& key) const { - auto it = valueMap_.find(key); - CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'"; - return it->second.r; -} - -template <> -int FuncConfig::get(const std::string& key) const { - auto it = valueMap_.find(key); - CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'"; - return it->second.i; -} - -template <> -bool FuncConfig::get(const std::string& key) const { - auto it = valueMap_.find(key); - CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'"; - return it->second.b; -} - -template <> -FuncConfig& FuncConfig::set(const std::string& key, size_t v) { - CHECK_EQ(static_cast(valueMap_.count(key)), 0) << "Duplicated value: " - << key; - valueMap_[key].s = v; - return *this; -} - -template <> -FuncConfig& FuncConfig::set(const std::string& key, real v) { - CHECK_EQ(static_cast(valueMap_.count(key)), 0) << "Duplicated value: " - << key; - valueMap_[key].r = v; - return *this; -} - -template <> -FuncConfig& FuncConfig::set(const std::string& key, int v) { - CHECK_EQ(static_cast(valueMap_.count(key)), 0) << "Duplicated value: " - << key; - valueMap_[key].i = v; - return *this; -} - -template <> -FuncConfig& FuncConfig::set(const std::string& key, bool v) { - CHECK_EQ(static_cast(valueMap_.count(key)), 0) << "Duplicated value: " - << key; - valueMap_[key].b = v; - return *this; -} - void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape, ArgType argType) { diff --git a/paddle/function/Function.h b/paddle/function/Function.h index 3bbeb6e525..15eb35b7f7 100644 --- a/paddle/function/Function.h +++ b/paddle/function/Function.h @@ -18,32 +18,49 @@ limitations under the License. 
*/

#include <map>
#include "BufferArg.h"
#include "paddle/math/Matrix.h"
+#include "paddle/utils/Any.h"
#include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Error.h"

namespace paddle {

/**
 * Function Configuration.
 * The argument type of Function::init.
- * Follow-up will consider moving this data structure to Proto inside.
 */
class FuncConfig {
public:
-  union value {
-    size_t s;
-    real r;
-    int i;
-    bool b;
-  };
-
  template <typename T>
-  T get(const std::string& key) const;
+  T get(const std::string& key, Error* err = nullptr) const {
+    try {
+      return any_cast<T>(valueMap_.at(key));
+    } catch (std::exception& e) {  // could be cast or out of range exception.
+      if (err) {
+        *err = Error(e.what());
+      } else {
+        LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
+      }
+      return T();
+    }
+  }

  template <typename T>
-  FuncConfig& set(const std::string& key, T v);
+  FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
+    auto it = valueMap_.find(key);
+    if (it != valueMap_.end()) {  // already contains key.
+      if (err) {
+        *err = Error("Key %s is already set in FuncConfig", key.c_str());
+      } else {
+        LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
+      }
+      return *this;
+    }
+    valueMap_[key] = any(v);
+    return *this;
+  }

protected:
-  std::map<std::string, value> valueMap_;
+  mutable std::unordered_map<std::string, any> valueMap_;
};

/**
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index f1a0d2a1a9..adba7c92ec 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -25,9 +25,9 @@ void Pad(real* outputs,
         const int inH,
         const int inW,
         const PadConf& pad) {
-  int cstart = pad.channelStart, cend = pad.channelEnd;
-  int hstart = pad.heightStart, hend = pad.heightEnd;
-  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int cstart = pad.channel[0], cend = pad.channel[1];
+  int hstart = pad.height[0], hend = pad.height[1];
+  int wstart = pad.width[0], wend = pad.width[1];
  int outC = inC + cstart + cend;
  int outH = inH + hstart + hend;
  int outW = inW + wstart + wend;
@@ -51,9 +51,9 @@ void PadGrad(real* inGrad,
            const int inH,
            const int inW,
            const PadConf& pad) {
-  int cstart = pad.channelStart, cend = pad.channelEnd;
-  int hstart = pad.heightStart, hend = pad.heightEnd;
-  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int cstart = pad.channel[0], cend = pad.channel[1];
+  int hstart = pad.height[0], hend = pad.height[1];
+  int wstart = pad.width[0], wend = pad.width[1];
  int outC = inC + cstart + cend;
  int outH = inH + hstart + hend;
  int outW = inW + wstart + wend;
@@ -71,6 +71,12 @@ void PadGrad(real* inGrad,
  }
}

+static inline PadConf castToPadConf(const FuncConfig& conf) {
+  return {conf.get<std::vector<uint32_t>>("channel"),
+          conf.get<std::vector<uint32_t>>("height"),
+          conf.get<std::vector<uint32_t>>("width")};
+}
+
/**
 * \brief Padding zeros to input according to the specify dimension.
 * The struct pad_ contains the padding size in each dimension.
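The :code:`castToPadConf` helper above pulls whole typed vectors out of the
new any-based :code:`FuncConfig` in a single :code:`get` call. Below is a
self-contained sketch of that mechanism, using C++17 :code:`std::any` in
place of :code:`paddle/utils/Any.h`; :code:`MiniFuncConfig` is an
illustrative stand-in, not paddle API:

.. code-block:: cpp

   #include <any>
   #include <cstdint>
   #include <iostream>
   #include <stdexcept>
   #include <string>
   #include <unordered_map>
   #include <vector>

   // Type-erased key/value store in the spirit of the new FuncConfig.
   class MiniFuncConfig {
   public:
     template <typename T>
     MiniFuncConfig& set(const std::string& key, T v) {
       // FuncConfig treats re-setting an existing key as an error; same here.
       if (!valueMap_.emplace(key, std::any(std::move(v))).second)
         throw std::runtime_error("key already set: " + key);
       return *this;
     }
     template <typename T>
     T get(const std::string& key) const {
       // at() throws on a missing key, any_cast throws on a type mismatch.
       return std::any_cast<T>(valueMap_.at(key));
     }
   private:
     std::unordered_map<std::string, std::any> valueMap_;
   };

   int main() {
     MiniFuncConfig conf;
     conf.set<std::vector<uint32_t>>("channel", {2, 3})
         .set<std::vector<uint32_t>>("height", {1, 2});
     auto c = conf.get<std::vector<uint32_t>>("channel");
     std::cout << c[0] << " " << c[1] << "\n";  // prints: 2 3
     return 0;
   }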
@@ -127,14 +133,7 @@ void PadGrad(real* inGrad,
template <DeviceType Device>
class PadFunc : public FunctionBase {
public:
-  void init(const FuncConfig& config) override {
-    pad_.channelStart = config.get<int>("cstart");
-    pad_.channelEnd = config.get<int>("cend");
-    pad_.heightStart = config.get<int>("hstart");
-    pad_.heightEnd = config.get<int>("hend");
-    pad_.widthStart = config.get<int>("wstart");
-    pad_.widthEnd = config.get<int>("wend");
-  }
+  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }

  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(1UL, inputs.size());
@@ -175,14 +174,7 @@ private:
template <DeviceType Device>
class PadGradFunc : public FunctionBase {
public:
-  void init(const FuncConfig& config) override {
-    pad_.channelStart = config.get<int>("cstart");
-    pad_.channelEnd = config.get<int>("cend");
-    pad_.heightStart = config.get<int>("hstart");
-    pad_.heightEnd = config.get<int>("hend");
-    pad_.widthStart = config.get<int>("wstart");
-    pad_.widthEnd = config.get<int>("wend");
-  }
+  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }

  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(1UL, inputs.size());
diff --git a/paddle/function/PadOp.h b/paddle/function/PadOp.h
index 7b5c730a6a..0e226ec737 100644
--- a/paddle/function/PadOp.h
+++ b/paddle/function/PadOp.h
@@ -19,18 +19,12 @@ limitations under the License. */
namespace paddle {

struct PadConf {
-  /// how many values to add before the data along channel dimension.
-  int channelStart;
-  /// how many values to add after the data along channel dimension.
-  int channelEnd;
-  /// how many values to add before the data along height dimension.
-  int heightStart;
-  /// how many values to add after the data along height dimension.
-  int heightEnd;
-  /// how many values to add before the data along width dimension.
-  int widthStart;
-  /// how many values to add after the data along width dimension.
-  int widthEnd;
+  /// how many values to add before/after the data along channel dimension.
+  std::vector<uint32_t> channel;
+  /// how many values to add before/after the data along height dimension.
+  std::vector<uint32_t> height;
+  /// how many values to add before/after the data along width dimension.
+  std::vector<uint32_t> width;
};

/**
diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu
index 9104b1aca5..9094f15284 100644
--- a/paddle/function/PadOpGpu.cu
+++ b/paddle/function/PadOpGpu.cu
@@ -44,9 +44,9 @@ void Pad(real* outputs,
  size_t nth = num * inC * inH * inW;
  int blockSize = 1024;
  int gridSize = (nth + 1024 - 1) / 1024;
-  int cstart = pad.channelStart, cend = pad.channelEnd;
-  int hstart = pad.heightStart, hend = pad.heightEnd;
-  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int cstart = pad.channel[0], cend = pad.channel[1];
+  int hstart = pad.height[0], hend = pad.height[1];
+  int wstart = pad.width[0], wend = pad.width[1];
  int outC = inC + cstart + cend;
  int outH = inH + hstart + hend;
  int outW = inW + wstart + wend;
@@ -83,9 +83,9 @@ void PadGrad(real* inGrad,
  int nth = num * inC * inH * inW;
  int blockSize = 1024;
  int gridSize = (nth + 1024 - 1) / 1024;
-  int cstart = pad.channelStart, cend = pad.channelEnd;
-  int hstart = pad.heightStart, hend = pad.heightEnd;
-  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int cstart = pad.channel[0], cend = pad.channel[1];
+  int hstart = pad.height[0], hend = pad.height[1];
+  int wstart = pad.width[0], wend = pad.width[1];
  int outC = inC + cstart + cend;
  int outH = inH + hstart + hend;
  int outW = inW + wstart + wend;
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
index cd22d91135..f77ac2a8c4 100644
--- a/paddle/function/PadOpTest.cpp
+++ b/paddle/function/PadOpTest.cpp
@@ -24,48 +24,22 @@ TEST(Pad, real) {
      for (size_t imgSizeW : {5, 32, 96}) {
        VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-
-        FunctionCompare compare("Pad",
-                                FuncConfig()
-                                    .set("cstart", 2)
-                                    .set("cend", 3)
-                                    .set("hstart", 1)
-                                    .set("hend", 2)
-                                    .set("wstart", 3)
-                                    .set("wend", 2));
-        TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
-        TensorShape outDims{
-            numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
-        compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, inDims));
-        compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outDims, ASSIGN_TO));
-        compare.run();
-      }
-    }
-  }
-}
-
-TEST(PadGrad, real) {
-  for (size_t numSamples : {5, 32}) {
-    for (size_t channels : {1, 5, 32}) {
-      for (size_t imgSizeH : {5, 33, 100}) {
-        for (size_t imgSizeW : {5, 32, 96}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          FunctionCompare compare("PadGrad",
-                                  FuncConfig()
-                                      .set("cstart", 2)
-                                      .set("cend", 3)
-                                      .set("hstart", 1)
-                                      .set("hend", 2)
-                                      .set("wstart", 3)
-                                      .set("wend", 2));
-          TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
-          TensorShape outDims{
-              numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
-          compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, outDims));
-          compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inDims, ASSIGN_TO));
-          compare.run();
+        for (bool test_grad : {false, true}) {
+          FunctionCompare compare(
+              test_grad ? "PadGrad" : "Pad",
+              FuncConfig()
+                  .set<std::vector<uint32_t>>("channel", {2, 3})
+                  .set<std::vector<uint32_t>>("height", {1, 2})
+                  .set<std::vector<uint32_t>>("width", {3, 2}));
+          TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
+          TensorShape outDims{
+              numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+          compare.addInputs(
+              BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
+          compare.addOutputs(BufferArg(
+              VALUE_TYPE_FLOAT, test_grad ?
inDims : outDims, ASSIGN_TO)); + compare.run(); + } } } } diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h index bc2f2f8563..f9c82a2bef 100644 --- a/paddle/gserver/gradientmachines/GradientMachine.h +++ b/paddle/gserver/gradientmachines/GradientMachine.h @@ -134,9 +134,7 @@ public: backward(callback); } - virtual Argument getLayerOutput(const std::string& layerName) { - return *((Argument*)nullptr); - } + virtual Argument getLayerOutput(const std::string& layerName) = 0; // see comment in Layer.h for the function with the same name virtual void resetState() {} diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 2d30029027..7b1b99b135 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -42,7 +42,8 @@ void AgentLayer::forward(PassType passType) { // get Arguments from real layers if (numSamples_ > 0 && numSamples_ < realHeight) { if (realOutput.ids) { - output_.ids->subVecFrom(*realOutput.ids, 0, numSamples_); + output_.ids = + IVector::create(realOutput.ids->getData(), numSamples_, useGpu_); } else { output_.subArgFrom( realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_); diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp index b8955ab04f..96cc4288c6 100644 --- a/paddle/gserver/layers/AverageLayer.cpp +++ b/paddle/gserver/layers/AverageLayer.cpp @@ -26,8 +26,6 @@ bool AverageLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { SequencePoolLayer::init(layerMap, parameterMap); - dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_); - outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_); // average strategy if (config_.average_strategy() == "average") { mode_ = kAverage; @@ -60,43 +58,9 @@ void AverageLayer::forward(PassType passType) { void AverageLayer::backward(const UpdateCallback& callback) { SequencePoolLayer::backward(callback); - const int* starts = startPositions_->getData(false); - MatrixPtr grad = getInputGrad(0); - - if (grad) { - size_t dim = getSize(); - real* gradientData = getInputGrad(0)->getData(); - real* gradient = getOutputGrad()->getData(); - size_t numSequences = startPositions_->getSize() - 1; - for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { - // TODO(Dangqingqing) optimization for GPU - int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; - if (0 == sequenceLength) { - // empty sequence - continue; - } - dataMtx_->setData( - gradientData + starts[sequenceId] * dim, sequenceLength, dim); - outMtx_->setData(gradient + sequenceId * dim); - switch (mode_) { - case kAverage: { - // plain average - dataMtx_->addBias(*outMtx_, 1.0f / sequenceLength); - break; - } - case kSum: { - // sum instead of average - dataMtx_->addBias(*outMtx_, 1.0f); - break; - } - case kAverageSquareRootN: { - // divide by square root of sequenceLength - dataMtx_->addBias(*outMtx_, 1.0f / sqrt(sequenceLength)); - break; - } - default: { LOG(FATAL) << "should not reach here"; } - } - } + if (getInputGrad(0)) { + getInputGrad(0)->sequenceAvgBackward( + *getOutputGrad(), *startPositions_->getVector(useGpu_), mode_); } } diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h index 621e1d7bb1..332552a304 100644 --- a/paddle/gserver/layers/AverageLayer.h +++ b/paddle/gserver/layers/AverageLayer.h @@ -45,8 +45,6 @@ public: void backward(const UpdateCallback& callback = nullptr) override; protected: - 
MatrixPtr outMtx_; - MatrixPtr dataMtx_; int mode_; }; } // namespace paddle diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp new file mode 100644 index 0000000000..3fbccc1103 --- /dev/null +++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp @@ -0,0 +1,122 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "NormLayer.h" +#include "paddle/math/BaseMatrix.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data, + size_t iter, + size_t spatialDim) { + return Matrix::create(data->getData() + iter * channels_ * spatialDim, + channels_, + spatialDim, + false, + useGpu_); +} + +MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data, + size_t iter, + size_t spatialDim) { + return Matrix::create( + data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_); +} + +void CrossChannelNormLayer::forward(PassType passType) { + Layer::forward(passType); + MatrixPtr inV = getInputValue(0); + + size_t batchSize = inV->getHeight(); + size_t dataDim = inV->getWidth(); + CHECK_EQ(getSize(), dataDim); + + reserveOutput(batchSize, dataDim); + MatrixPtr outV = getOutputValue(); + size_t spatialDim = dataDim / channels_; + + Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_); + Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_); + Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_); + normBuffer_->zeroMem(); + // add eps to avoid overflow + normBuffer_->addScalar(*normBuffer_, 1e-6); + inV->square2(*dataBuffer_); + for (size_t i = 0; i < batchSize; i++) { + const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); + const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim); + MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim); + MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim); + + // compute norm. + spatialBuffer_->sumCols(*dataTmp, 1, 0); + spatialBuffer_->sqrt2(*spatialBuffer_); + normTmp->copyFrom(*spatialBuffer_); + outVTmp->copyFrom(*inVTmp); + outVTmp->divRowVector(*spatialBuffer_); + // scale the layer. 
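+    // note: scale_->getW() holds one trainable factor per channel
+    // (a channels_ x 1 weight), so mulColVector() multiplies each
+    // channel row of this sample by its own scale.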
+    outVTmp->mulColVector(*scale_->getW());
+  }
+}
+
+void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr outV = getOutputValue();
+
+  size_t batchSize = inG->getHeight();
+  size_t dataDim = inG->getWidth();
+  size_t spatialDim = dataDim / channels_;
+
+  dataBuffer_->dotMul(*outG, *outV);
+  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
+  scaleDiff_->zeroMem();
+  for (size_t i = 0; i < batchSize; i++) {
+    MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
+    const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+
+    channelBuffer_->sumRows(*dataTmp, 1, 0);
+    channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
+    // store a / scale[i] in scaleDiff_ temporarily
+    scaleDiff_->add(*channelBuffer_, 1.);
+
+    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
+    spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
+    // scale the grad
+    inGTmp->copyFrom(*inVTmp);
+    inGTmp->mulRowVector(*spatialBuffer_);
+    // divide by square of norm
+    spatialBuffer_->dotMul(*normTmp, *normTmp);
+    inGTmp->divRowVector(*spatialBuffer_);
+    // subtract
+    inGTmp->add(*outGTmp, -1, 1);
+    // divide by norm
+    inGTmp->divRowVector(*normTmp);
+    // scale the diff
+    inGTmp->mulColVector(*scale_->getW());
+  }
+  // update scale
+  if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
+  scale_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
index 9ddccc2027..fdcf994cdb 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -107,6 +107,10 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
  int channel = isDeconv_ ? numFilters_ : channels_[inIdx];

  resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
+
+  CHECK_EQ(image->getWidth(),
+           static_cast<size_t>(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel));
+
  real *imgData = image->getData() + startIdx * image->getWidth();
  MatrixPtr imageTmp =
      Matrix::create(imgData,
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 3f6875fb9f..9afd40b167 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -36,7 +36,7 @@ namespace paddle {
 * |     |- 5
 * |
 * |-*- 0
- * |- 1
+ *    |- 1
 * @endcode
 *
 * where * indicates an internal node, and each leaf node represents a class.
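For reference, the arithmetic that :code:`CrossChannelNormLayer::forward`
implements per sample can be written with plain arrays as in the sketch
below. This is only an illustration, assuming a row-major
[channels x spatialDim] layout rather than paddle's Matrix API; the 1e-6
term mirrors the :code:`addScalar` guard in :code:`forward()`:

.. code-block:: cpp

   #include <cmath>
   #include <cstdio>
   #include <vector>

   // One sample laid out as in[channel * spatialDim + s]: every spatial
   // column is divided by its L2 norm across channels, then row c is
   // multiplied by the trainable factor scale[c].
   void crossChannelNorm(const std::vector<float>& in, std::vector<float>& out,
                         const std::vector<float>& scale, int channels,
                         int spatialDim) {
     for (int s = 0; s < spatialDim; ++s) {
       float sum = 1e-6f;  // epsilon guard against an all-zero column
       for (int c = 0; c < channels; ++c) {
         float v = in[c * spatialDim + s];
         sum += v * v;
       }
       const float norm = std::sqrt(sum);
       for (int c = 0; c < channels; ++c)
         out[c * spatialDim + s] = scale[c] * in[c * spatialDim + s] / norm;
     }
   }

   int main() {
     // 2 channels x 2 spatial positions; column 0 holds (3, 4), so its norm is 5.
     std::vector<float> in = {3.f, 0.f, 4.f, 0.f};
     std::vector<float> out(4), scale = {1.f, 2.f};
     crossChannelNorm(in, out, scale, 2, 2);
     std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 0.6 0 1.6 0
     return 0;
   }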
diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp index 3db0af2515..e094078bfe 100644 --- a/paddle/gserver/layers/NormLayer.cpp +++ b/paddle/gserver/layers/NormLayer.cpp @@ -26,6 +26,8 @@ Layer* NormLayer::create(const LayerConfig& config) { return new ResponseNormLayer(config); } else if (norm == "cmrnorm-projection") { return new CMRProjectionNormLayer(config); + } else if (norm == "cross-channel-norm") { + return new CrossChannelNormLayer(config); } else { LOG(FATAL) << "Unknown norm type: " << norm; return nullptr; @@ -54,4 +56,14 @@ bool ResponseNormLayer::init(const LayerMap& layerMap, return true; } +bool CrossChannelNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + CHECK(parameters_[0]); + const NormConfig& conf = config_.inputs(0).norm_conf(); + channels_ = conf.channels(); + scale_.reset(new Weight(channels_, 1, parameters_[0])); + return true; +} + } // namespace paddle diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h index e77faaa322..7c238ac944 100644 --- a/paddle/gserver/layers/NormLayer.h +++ b/paddle/gserver/layers/NormLayer.h @@ -65,4 +65,35 @@ public: } }; +/** + * This layer applies normalization across the channels of each sample to a + * conv layer's output, and scales the output by a group of trainable factors + * whose dimension equals the number of channels. + * - Input: One and only one input layer is accepted. + * - Output: The normalized data of the input data. + * Reference: + * Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, + * Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector + */ +class CrossChannelNormLayer : public NormLayer { +public: + explicit CrossChannelNormLayer(const LayerConfig& config) + : NormLayer(config) {} + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + void forward(PassType passType); + void backward(const UpdateCallback& callback); + MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim); + MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim); + +protected: + size_t channels_; + std::unique_ptr<Weight> scale_; + MatrixPtr scaleDiff_; + MatrixPtr normBuffer_; + MatrixPtr dataBuffer_; + MatrixPtr channelBuffer_; + MatrixPtr spatialBuffer_; + MatrixPtr sampleBuffer_; +}; + } // namespace paddle diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp index bb618c09f9..a5ed7e057a 100644 --- a/paddle/gserver/layers/PadLayer.cpp +++ b/paddle/gserver/layers/PadLayer.cpp @@ -36,12 +36,9 @@ bool PadLayer::init(const LayerMap& layerMap, CHECK_EQ(2, pad_conf.pad_c_size()); CHECK_EQ(2, pad_conf.pad_h_size()); CHECK_EQ(2, pad_conf.pad_w_size()); - padc_.push_back(pad_conf.pad_c(0)); - padc_.push_back(pad_conf.pad_c(1)); - padh_.push_back(pad_conf.pad_h(0)); - padh_.push_back(pad_conf.pad_h(1)); - padw_.push_back(pad_conf.pad_w(0)); - padw_.push_back(pad_conf.pad_w(1)); + padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)}; + padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)}; + padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)}; outDims_ = TensorShape(4); setOutDims(0); @@ -49,21 +46,15 @@ bool PadLayer::init(const LayerMap& layerMap, createFunction(forward_, "Pad", FuncConfig() - .set("cstart", padc_[0]) - .set("cend", padc_[1]) - .set("hstart", padh_[0]) - .set("hend", padh_[1]) - .set("wstart", padw_[0]) - .set("wend", padw_[1])); + .set("channel", padc_) + .set("height", padh_) +
.set("width", padw_)); createFunction(backward_, "PadGrad", FuncConfig() - .set("cstart", padc_[0]) - .set("cend", padc_[1]) - .set("hstart", padh_[0]) - .set("hend", padh_[1]) - .set("wstart", padw_[0]) - .set("wend", padw_[1])); + .set("channel", padc_) + .set("height", padh_) + .set("width", padw_)); return true; } diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h index b2bbf28082..fe9388d8cc 100644 --- a/paddle/gserver/layers/PadLayer.h +++ b/paddle/gserver/layers/PadLayer.h @@ -38,9 +38,9 @@ protected: void setOutDims(const size_t batchSize); void setTensorDim(const size_t batchSize); - std::vector padc_; - std::vector padh_; - std::vector padw_; + std::vector padc_; + std::vector padh_; + std::vector padw_; TensorShape inDims_; TensorShape outDims_; }; diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp index bcf5e912a5..331bc7672e 100644 --- a/paddle/gserver/layers/PriorBox.cpp +++ b/paddle/gserver/layers/PriorBox.cpp @@ -20,7 +20,7 @@ namespace paddle { /** * @brief A layer for generating priorbox locations and variances. * - Input: Two and only two input layer are accepted. The input layer must be - * be a data output layer and a convolution output layer. + * be a data output layer and a convolution output layer. * - Output: The priorbox locations and variances of the input data. * Reference: * Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, @@ -45,27 +45,32 @@ protected: MatrixPtr buffer_; }; +REGISTER_LAYER(priorbox, PriorBoxLayer); + bool PriorBoxLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { Layer::init(layerMap, parameterMap); auto pbConf = config_.inputs(0).priorbox_conf(); + std::vector tmp; + aspectRatio_.push_back(1.); std::copy(pbConf.min_size().begin(), pbConf.min_size().end(), std::back_inserter(minSize_)); std::copy(pbConf.max_size().begin(), pbConf.max_size().end(), std::back_inserter(maxSize_)); - std::copy(pbConf.aspect_ratio().begin(), - pbConf.aspect_ratio().end(), - std::back_inserter(aspectRatio_)); std::copy(pbConf.variance().begin(), pbConf.variance().end(), std::back_inserter(variance_)); + std::copy(pbConf.aspect_ratio().begin(), + pbConf.aspect_ratio().end(), + std::back_inserter(tmp)); // flip - int inputRatioLength = aspectRatio_.size(); - for (int index = 0; index < inputRatioLength; index++) - aspectRatio_.push_back(1 / aspectRatio_[index]); - aspectRatio_.push_back(1.); + int inputRatioLength = tmp.size(); + for (int index = 0; index < inputRatioLength; index++) { + aspectRatio_.push_back(tmp[index]); + aspectRatio_.push_back(1 / tmp[index]); + } numPriors_ = aspectRatio_.size(); if (maxSize_.size() > 0) numPriors_++; return true; @@ -94,12 +99,12 @@ void PriorBoxLayer::forward(PassType passType) { for (int w = 0; w < layerWidth; ++w) { real centerX = (w + 0.5) * stepW; real centerY = (h + 0.5) * stepH; - int minSize = 0; + real minSize = 0; for (size_t s = 0; s < minSize_.size(); s++) { // first prior. minSize = minSize_[s]; - int boxWidth = minSize; - int boxHeight = minSize; + real boxWidth = minSize; + real boxHeight = minSize; // xmin, ymin, xmax, ymax. tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; @@ -112,7 +117,7 @@ void PriorBoxLayer::forward(PassType passType) { CHECK_EQ(minSize_.size(), maxSize_.size()); // second prior. 
for (size_t s = 0; s < maxSize_.size(); s++) { - int maxSize = maxSize_[s]; + real maxSize = maxSize_[s]; boxWidth = boxHeight = sqrt(minSize * maxSize); tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; @@ -145,6 +150,5 @@ void PriorBoxLayer::forward(PassType passType) { MatrixPtr outV = getOutputValue(); outV->copyFrom(buffer_->data_, dim * 2); } -REGISTER_LAYER(priorbox, PriorBoxLayer); } // namespace paddle diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp index 7a13cd7ad0..944c705166 100644 --- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp +++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp @@ -25,6 +25,11 @@ namespace paddle { * Input: a sequence * If SequenceLevel = kNonseq: * Output: a sequence containing only the last instance of the input sequence + * If stride_ > 0: + * Output: a shortened sequence. The operation of getting the last instance + * of a sequence is performed independently on every slice of the + * input sequence, where each slice is obtained by sliding a window + * with the window size set to stride_. * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: a sequence containing only the last instance of each sub-sequence @@ -37,6 +42,7 @@ class SequenceLastInstanceLayer : public SequencePoolLayer { protected: MatrixPtr tmpSrc_; MatrixPtr tmpDest_; + std::vector<int> instanceIds_; public: explicit SequenceLastInstanceLayer(const LayerConfig& config) @@ -54,6 +60,7 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer); bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { SequencePoolLayer::init(layerMap, parameterMap); + reversed_ = config_.select_first(); tmpSrc_ = Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); @@ -66,7 +73,8 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, void SequenceLastInstanceLayer::forward(PassType passType) { SequencePoolLayer::forward(passType); - const int* starts = startPositions_->getData(false); + auto starts = (stride_ > 0) ? stridePositions_->getData() : startPositions_->getData(false); MatrixPtr inputValue = getInputValue(0); MatrixPtr outputValue = getOutputValue(); @@ -74,9 +82,10 @@ void SequenceLastInstanceLayer::forward(PassType passType) { AsyncGpuBlock asyncGpuBlock; REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str()); + instanceIds_.clear(); for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { - int insId = - config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1; + int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1; + instanceIds_.push_back(insId); outputValue->subMatrix(seqId, 1, tmpDest_) ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_))); @@ -96,18 +105,13 @@ void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) { MatrixPtr inputGrad = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); - const int* starts = startPositions_->getData(false); - size_t numSequences = startPositions_->getSize() - 1; if (inputGrad) { AsyncGpuBlock asyncGpuBlock; REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str()); - for (size_t seqId = 0; seqId < numSequences; ++seqId) { - int insId = - config_.select_first() ?
starts[seqId] : starts[seqId + 1] - 1; - - inputGrad->subMatrix(insId, 1, tmpDest_) + for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { + inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_) ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_))); } } diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp index 5807c42496..8c49502011 100644 --- a/paddle/gserver/layers/SequencePoolLayer.cpp +++ b/paddle/gserver/layers/SequencePoolLayer.cpp @@ -37,6 +37,7 @@ bool SequencePoolLayer::init(const LayerMap& layerMap, } else { LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); } + stride_ = config_.seq_pool_stride(); setNeedSequenceInfo(false); return true; } @@ -55,8 +56,6 @@ void SequencePoolLayer::forward(PassType passType) { CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize()); CHECK_EQ(newBatchSize_, starts->getSize() - 1); - resetOutput(newBatchSize_, dim); - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, * thus, in this case, output_ has no sequenceStartPositions. * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this @@ -67,6 +66,15 @@ void SequencePoolLayer::forward(PassType passType) { << "when trans_type = seq, input must hasSubseq"; output_.degradeSequence(input); } + if (stride_ > 0) { + CHECK_EQ(input.hasSubseq(), 0UL) + << "sequence stride pooling is invalid for hasSubseq now"; + output_.poolSequenceWithStride( + input, stride_, &stridePositions_, reversed_); + newBatchSize_ = stridePositions_->getSize() - 1; + } + + resetOutput(newBatchSize_, dim); } void SequencePoolLayer::backward(const UpdateCallback& callback) { diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h index 85b51ccd1d..293d1bf278 100644 --- a/paddle/gserver/layers/SequencePoolLayer.h +++ b/paddle/gserver/layers/SequencePoolLayer.h @@ -26,6 +26,10 @@ namespace paddle { * Output: output size is the number of input sequences (NOT input instances) * output[i] = seqlastin/average/max_{for each instance in this * sequence}{input[i]} + * If stride_ > 0: + * Check that the input sequence has no sub-sequences + * Output: a shortened sequence; pooling is performed upon a small local + * area (one stride window at a time) * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences @@ -42,6 +46,11 @@ protected: enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; size_t newBatchSize_; ICpuGpuVectorPtr startPositions_; + int stride_; + // Store the start position of each window. + IVectorPtr stridePositions_; + // Whether the input sequence is reversed or not. + bool reversed_ = false; public: explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {} diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 7617af10ba..a0b1cd471d 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -778,8 +778,10 @@ void testProjectionGrad(ProjectionConfig conf, config.biasSize = biasSize == 0 ?
config.layerConfig.size() : biasSize; config.layerConfig.set_bias_size(config.biasSize); config.layerConfig.set_shared_biases(sharedBias); - config.inputDefs.push_back( - {inputType, "layer_0", conf.input_size(), parameterSize}); + config.inputDefs.push_back({inputType, + "layer_0", + static_cast(conf.input_size()), + parameterSize}); *config.layerConfig.add_inputs()->mutable_proj_conf() = conf; config.testState = testState; testLayerGrad(config, "mixed", batchSize, false, useGpu); diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 5f8a7b79a0..193b876c31 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -804,10 +804,14 @@ TEST(Layer, ExpandLayer) { testExpandLayer("seq", true); // seq expand to hasSubseq } -void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) { +void testDegradeLayer(bool hasSubseq, + string layer_type, + string trans_type, + int stride) { TestConfig config; config.layerConfig.set_type(layer_type); config.layerConfig.set_size(10); + config.layerConfig.set_seq_pool_stride(stride); config.biasSize = 0; config.inputDefs.push_back( @@ -827,36 +831,46 @@ void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) { if (layer_type == "average") { for (auto strategy : {"average", "sum", "squarerootn"}) { LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type - << " average_strategy=" << strategy; + << " average_strategy=" << strategy + << " seq_pool_stride=" << stride; config.layerConfig.set_average_strategy(strategy); testDegradeLayerGrad(config, layer_type); } } else { - LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type; + LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type + << " seq_pool_stride=" << stride; testDegradeLayerGrad(config, layer_type); } } TEST(Layer, MaxLayer) { - testDegradeLayer(false, "max", "non-seq"); // seq max to non-seq - testDegradeLayer(true, "max", "non-seq"); // hasSubseq max to non-seq - testDegradeLayer(true, "max", "seq"); // hasSubseq max to seq + testDegradeLayer(false, "max", "non-seq", -1); // seq max to non-seq + testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq + testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq } TEST(Layer, SequenceLastInstanceLayer) { testDegradeLayer(false, "seqlastins", - "non-seq"); // seq seqlastins to non-seq + "non-seq", + -1); // seq seqlastins to non-seq + testDegradeLayer(false, + "seqlastins", + "non-seq", + 5); // seq seqlastins to a shorten seq, stride window = 5 testDegradeLayer(true, "seqlastins", - "non-seq"); // hasSubseq seqlastins to non-seq - testDegradeLayer(true, "seqlastins", "seq"); // hasSubseq seqlastins to seq + "non-seq", + -1); // hasSubseq seqlastins to non-seq + testDegradeLayer( + true, "seqlastins", "seq", -1); // hasSubseq seqlastins to seq } TEST(Layer, AverageLayer) { - testDegradeLayer(false, "average", "non-seq"); // seq average to non-seq - testDegradeLayer(true, "average", "non-seq"); // hasSubseq average to non-seq - testDegradeLayer(true, "average", "seq"); // hasSubseq average to seq + testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq + testDegradeLayer( + true, "average", "non-seq", -1); // hasSubseq average to non-seq + testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq } TEST(Layer, SequenceConcatLayer) { @@ -1642,6 +1656,25 @@ TEST(Layer, PadLayer) { } } +TEST(Layer, CrossChannelNormLayer) { 
+ TestConfig config; + config.layerConfig.set_type("norm"); + config.layerConfig.set_size(100); + LayerInputConfig* input = config.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type("cross-channel-norm"); + norm->set_channels(10); + norm->set_size(100); + norm->set_scale(0); + norm->set_pow(0); + norm->set_blocked(0); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10}); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5); + } +} + TEST(Layer, smooth_l1) { TestConfig config; config.layerConfig.set_type("smooth_l1"); diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index 0a0d92d1ae..de48b6fac9 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -1453,6 +1453,24 @@ void BaseMatrixT::divRowVector(BaseMatrixT& b) { true_type() /* bAsRowVector */, false_type()); } +template +void BaseMatrixT::mulColVector(BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::DotMul(), b, numRows, numCols, offset, + false_type(), true_type() /* bAsColVector */); +} + +template +void BaseMatrixT::divColVector(BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::DotDiv(), b, numRows, numCols, offset, + false_type(), true_type() /* bAsColVector */); +} + template<> template int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h index 8691c87ac3..6ed48c8d88 100644 --- a/paddle/math/BaseMatrix.h +++ b/paddle/math/BaseMatrix.h @@ -545,6 +545,9 @@ public: void mulRowVector(BaseMatrixT& b); void divRowVector(BaseMatrixT& b); + void mulColVector(BaseMatrixT& b); + void divColVector(BaseMatrixT& b); + void addP2P(BaseMatrixT& b); /** diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index d7aa118487..6203cd3b9a 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -85,11 +85,16 @@ int getrf(const CBLAS_ORDER order, float* A, const int lda, int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_sgetrf(order, M, N, A, lda, ipiv); #else return LAPACKE_sgetrf(order, M, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -99,11 +104,16 @@ int getrf(const CBLAS_ORDER order, double* A, const int lda, int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_dgetrf(order, M, N, A, lda, ipiv); #else return LAPACKE_dgetrf(order, M, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -112,11 +122,16 @@ int getri(const CBLAS_ORDER order, float* A, const int lda, const int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_sgetri(order, N, A, lda, ipiv); #else return LAPACKE_sgetri(order, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> @@ -125,11 +140,16 @@ int getri(const CBLAS_ORDER order, double* A, const int lda, const int* ipiv) { +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS return clapack_dgetri(order, N, A, lda, ipiv); #else return LAPACKE_dgetri(order, N, A, lda, ipiv); #endif +#else + LOG(FATAL) << "Not implemented"; +#endif + return 0; } template <> diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index c8559eefd8..9f8f84a87c 100644 --- a/paddle/math/MathFunctions.h +++ 
b/paddle/math/MathFunctions.h @@ -17,11 +17,14 @@ limitations under the License. */ #ifdef PADDLE_USE_MKL #include +#ifdef PADDLE_USE_LAPACK #include +#endif #else extern "C" { #include } +#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS extern "C" { #include @@ -30,6 +33,7 @@ extern "C" { #include #endif #endif +#endif #include diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 9eead5b62c..55a7344495 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -483,6 +483,20 @@ void GpuMatrix::sequenceAvgForward(Matrix& a, hl_sequence_avg_forward(dst, src, starts, height, width, mode); } +void GpuMatrix::sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = a.getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + + hl_sequence_avg_backward(dst, src, starts, height, width, mode); +} + /* this = scaleAB*(a*b) + scaleT*this */ void GpuMatrix::mul(const GpuMatrix& a, const GpuMatrix& b, @@ -2304,6 +2318,41 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, } } +void CpuMatrix::sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = a.getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); + MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); + for (size_t i = 0; i < height; ++i) { + int sequenceLength = starts[i + 1] - starts[i]; + if (0 == sequenceLength) { + // empty sequence + continue; + } + outMtx->setData(dst + starts[i] * width, sequenceLength, width); + dataMtx->setData(src + i * width); + if (mode == 0) { + // plain average + outMtx->addBias(*dataMtx, 1.0f / sequenceLength); + } else if (mode == 1) { + // sum instead of average + outMtx->addBias(*dataMtx, 1.0f); + } else if (mode == 2) { + // divide by square root of sequenceLength + outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength)); + } else { + LOG(FATAL) << "should not reach here"; + } + } +} + /* this = scaleAB*(a*b) + scaleT*this*/ void CpuMatrix::mul(const Matrix& a, const Matrix& b, @@ -2377,41 +2426,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { int lda = a->getStride(); int ldb = b->getStride(); int ldc = getStride(); -#ifndef PADDLE_TYPE_DOUBLE - cblas_sgemm(CblasRowMajor, - a_trans, - b_trans, - M, - N, - K, - scaleAB, - A, - lda, - B, - ldb, - scaleT, - C, - ldc); -#else - cblas_dgemm(CblasRowMajor, - a_trans, - b_trans, - M, - N, - K, - scaleAB, - A, - lda, - B, - ldb, - scaleT, - C, - ldc); -// TODO(yuyang18): Is gemm defined other place? 
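The rewritten CpuMatrix::mul above replaces the PADDLE_TYPE_DOUBLE #ifdef with a single call to a typed gemm wrapper defined in MathFunctions; the wrapper itself is not part of this hunk. A sketch of what such a dispatch plausibly looks like, assuming it simply mirrors the cblas signature (the real declaration may differ):

```cpp
#include <cblas.h>

// One gemm<T>() overload set hides the cblas_sgemm/cblas_dgemm choice that
// the removed #ifdef used to make inline at the call site.
template <class T>
void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
          const int M, const int N, const int K, const T alpha, const T* A,
          const int lda, const T* B, const int ldb, const T beta, T* C,
          const int ldc);

template <>
void gemm<float>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
                 const int M, const int N, const int K, const float alpha,
                 const float* A, const int lda, const float* B, const int ldb,
                 const float beta, float* C, const int ldc) {
  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);
}

template <>
void gemm<double>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
                  const int M, const int N, const int K, const double alpha,
                  const double* A, const int lda, const double* B,
                  const int ldb, const double beta, double* C, const int ldc) {
  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);
}
```

Centralizing the dispatch means the row-major convention and the float/double choice live in one place instead of being repeated at every BLAS call site.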
-#endif - - VLOG(2) << " A[0]=" << A[0] << " A[1]=" << A[1] << " B[0]=" << B[0] - << " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1]; + gemm( + a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); } void CpuMatrix::mul( diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index dbdb629614..3252adb19e 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -461,6 +461,12 @@ public: LOG(FATAL) << "Not implemented"; } + virtual void sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + LOG(FATAL) << "Not implemented"; + } + /** * @code * this = scaleAB*(a*b) + scaleT*this @@ -1203,6 +1209,7 @@ public: void collectSharedBias(Matrix& a, real scale); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); + void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); /** * @code @@ -1619,6 +1626,7 @@ public: void collectSharedBias(Matrix& a, real scale); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); + void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); /** * @code diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/math/SIMDFunctions.cpp index 95219debf5..d66d543a61 100644 --- a/paddle/math/SIMDFunctions.cpp +++ b/paddle/math/SIMDFunctions.cpp @@ -13,119 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "SIMDFunctions.h" +#ifdef __SSE3__ #include +#endif #include -#ifndef __AVX__ -static void addto_sse(float* a, const float* b, size_t len) { - int offset = len % 16; - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - - for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) { - ma0 = _mm_load_ps(a); - ma1 = _mm_load_ps(a + 4); - ma2 = _mm_load_ps(a + 8); - ma3 = _mm_load_ps(a + 12); - - mb0 = _mm_load_ps(b); - mb1 = _mm_load_ps(b + 4); - mb2 = _mm_load_ps(b + 8); - mb3 = _mm_load_ps(b + 12); - - ma0 = _mm_add_ps(ma0, mb0); - ma1 = _mm_add_ps(ma1, mb1); - ma2 = _mm_add_ps(ma2, mb2); - ma3 = _mm_add_ps(ma3, mb3); - - _mm_store_ps(a, ma0); - _mm_store_ps(a + 4, ma1); - _mm_store_ps(a + 8, ma2); - _mm_store_ps(a + 12, ma3); - } - - for (int i = 0; i < offset; i++) a[i] += b[i]; -} - -static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { - int offset = len % 16; - - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - - for (unsigned int k = 0; k < len / 16; k++, a += 16) { - ma0 = _mm_load_ps(a); - ma1 = _mm_load_ps(a + 4); - ma2 = _mm_load_ps(a + 8); - ma3 = _mm_load_ps(a + 12); - - for (int i = 0; i < batch; i++) { - mb0 = _mm_load_ps(b[i]); - mb1 = _mm_load_ps(b[i] + 4); - mb2 = _mm_load_ps(b[i] + 8); - mb3 = _mm_load_ps(b[i] + 12); - ma0 = _mm_add_ps(ma0, mb0); - ma1 = _mm_add_ps(ma1, mb1); - ma2 = _mm_add_ps(ma2, mb2); - ma3 = _mm_add_ps(ma3, mb3); - b[i] += 16; - } - - _mm_store_ps(a, ma0); - _mm_store_ps(a + 4, ma1); - _mm_store_ps(a + 8, ma2); - _mm_store_ps(a + 12, ma3); - } - - for (int i = 0; i < offset; i++) { - for (int k = 0; k < batch; k++) a[i] += b[k][i]; - } - return; -} - -static void col_max_sse(float* result, - const float* data, - int dim, - int numSamples) { - // first sample, direct copy - for (int d = 0; d < dim; ++d) { - result[d] = data[d]; - } - int offset = dim % 16; - __m128 ma0, ma1, ma2, ma3; - __m128 mb0, mb1, mb2, mb3; - // first 16n dims - for (int k = 0; k < dim / 16; k++, result += 16, data += 16) { - ma0 = _mm_load_ps(result); - ma1 = _mm_load_ps(result + 4); - ma2 = _mm_load_ps(result + 8); - ma3 = 
_mm_load_ps(result + 12); - for (int i = 1; i < numSamples; i++) { - mb0 = _mm_load_ps(data + i * dim); - mb1 = _mm_load_ps(data + i * dim + 4); - mb2 = _mm_load_ps(data + i * dim + 8); - mb3 = _mm_load_ps(data + i * dim + 12); - ma0 = _mm_max_ps(ma0, mb0); - ma1 = _mm_max_ps(ma1, mb1); - ma2 = _mm_max_ps(ma2, mb2); - ma3 = _mm_max_ps(ma3, mb3); - } - _mm_store_ps(result, ma0); - _mm_store_ps(result + 4, ma1); - _mm_store_ps(result + 8, ma2); - _mm_store_ps(result + 12, ma3); - } - // last dims - for (int d = 0; d < offset; ++d) { - float sm = data[d]; - for (int i = 1; i < numSamples; ++i) { - sm = std::max(sm, data[i * dim + d]); - } - result[d] = sm; - } -} - -#else +#ifdef __AVX__ static void addto_avx(float* a, const float* b, size_t len) { int offset = len % 32; @@ -355,17 +248,128 @@ static void decayL1_avx( } } +#elif defined(__SSE3__) + +static void addto_sse(float* a, const float* b, size_t len) { + int offset = len % 16; + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + + for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) { + ma0 = _mm_load_ps(a); + ma1 = _mm_load_ps(a + 4); + ma2 = _mm_load_ps(a + 8); + ma3 = _mm_load_ps(a + 12); + + mb0 = _mm_load_ps(b); + mb1 = _mm_load_ps(b + 4); + mb2 = _mm_load_ps(b + 8); + mb3 = _mm_load_ps(b + 12); + + ma0 = _mm_add_ps(ma0, mb0); + ma1 = _mm_add_ps(ma1, mb1); + ma2 = _mm_add_ps(ma2, mb2); + ma3 = _mm_add_ps(ma3, mb3); + + _mm_store_ps(a, ma0); + _mm_store_ps(a + 4, ma1); + _mm_store_ps(a + 8, ma2); + _mm_store_ps(a + 12, ma3); + } + + for (int i = 0; i < offset; i++) a[i] += b[i]; +} + +static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { + int offset = len % 16; + + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + + for (unsigned int k = 0; k < len / 16; k++, a += 16) { + ma0 = _mm_load_ps(a); + ma1 = _mm_load_ps(a + 4); + ma2 = _mm_load_ps(a + 8); + ma3 = _mm_load_ps(a + 12); + + for (int i = 0; i < batch; i++) { + mb0 = _mm_load_ps(b[i]); + mb1 = _mm_load_ps(b[i] + 4); + mb2 = _mm_load_ps(b[i] + 8); + mb3 = _mm_load_ps(b[i] + 12); + ma0 = _mm_add_ps(ma0, mb0); + ma1 = _mm_add_ps(ma1, mb1); + ma2 = _mm_add_ps(ma2, mb2); + ma3 = _mm_add_ps(ma3, mb3); + b[i] += 16; + } + + _mm_store_ps(a, ma0); + _mm_store_ps(a + 4, ma1); + _mm_store_ps(a + 8, ma2); + _mm_store_ps(a + 12, ma3); + } + + for (int i = 0; i < offset; i++) { + for (int k = 0; k < batch; k++) a[i] += b[k][i]; + } + return; +} + +static void col_max_sse(float* result, + const float* data, + int dim, + int numSamples) { + // first sample, direct copy + for (int d = 0; d < dim; ++d) { + result[d] = data[d]; + } + int offset = dim % 16; + __m128 ma0, ma1, ma2, ma3; + __m128 mb0, mb1, mb2, mb3; + // first 16n dims + for (int k = 0; k < dim / 16; k++, result += 16, data += 16) { + ma0 = _mm_load_ps(result); + ma1 = _mm_load_ps(result + 4); + ma2 = _mm_load_ps(result + 8); + ma3 = _mm_load_ps(result + 12); + for (int i = 1; i < numSamples; i++) { + mb0 = _mm_load_ps(data + i * dim); + mb1 = _mm_load_ps(data + i * dim + 4); + mb2 = _mm_load_ps(data + i * dim + 8); + mb3 = _mm_load_ps(data + i * dim + 12); + ma0 = _mm_max_ps(ma0, mb0); + ma1 = _mm_max_ps(ma1, mb1); + ma2 = _mm_max_ps(ma2, mb2); + ma3 = _mm_max_ps(ma3, mb3); + } + _mm_store_ps(result, ma0); + _mm_store_ps(result + 4, ma1); + _mm_store_ps(result + 8, ma2); + _mm_store_ps(result + 12, ma3); + } + // last dims + for (int d = 0; d < offset; ++d) { + float sm = data[d]; + for (int i = 1; i < numSamples; ++i) { + sm = std::max(sm, data[i * dim + d]); + } + result[d] = 
sm; + } +} + #endif -#ifndef __AVX__ -#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__) -#else +#if defined(__AVX__) #define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__) +#elif defined(__SSE3__) +#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__) #endif namespace paddle { namespace simd { namespace internal { +#ifdef __SSE3__ void addToImpl(float* a, const float* b, size_t len) { SIMD_INVOKE(addto, a, b, len); } @@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) { void colMaxImpl(float* result, const float* data, int dim, int numSamples) { SIMD_INVOKE(col_max, result, data, dim, numSamples); } +#endif #ifdef __AVX__ void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) { @@ -385,8 +390,8 @@ void decayL1AvxImpl( float* dst, float* src, float* lr, float lambda, size_t len) { decayL1_avx(dst, src, lr, lambda, len); } - #endif + } // namespace internal } // namespace simd } // namespace paddle diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h index 9b0a8719b2..439f11b79d 100644 --- a/paddle/math/SIMDFunctions.h +++ b/paddle/math/SIMDFunctions.h @@ -128,17 +128,29 @@ void decayL1AvxImpl( template <> inline void addTo(float* a, const float* b, size_t len) { +#ifdef __SSE3__ internal::addToImpl(a, b, len); +#else + naive::addTo(a, b, len); +#endif } template <> inline void batchAddTo(float* a, const float* b[], int batch, size_t len) { +#ifdef __SSE3__ internal::batchAddToImpl(a, b, batch, len); +#else + naive::batchAddTo(a, b, batch, len); +#endif } template <> inline void colMax(float* result, const float* data, int dim, int numSamples) { +#ifdef __SSE3__ internal::colMaxImpl(result, data, dim, numSamples); +#else + naive::colMax(result, data, dim, numSamples); +#endif } template <> diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 56e5442394..7ce17a3207 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -14,6 +14,7 @@ limitations under the License. 
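The net effect of the SIMDFunctions reshuffle above: the SSE kernels were previously compiled whenever AVX was absent, which broke non-x86 builds; they are now guarded by __SSE3__, with the portable naive loops used everywhere else. The pattern, reduced to one self-contained function (the unaligned loads and 4-wide loop below are simplifications of the real aligned, 16-wide kernels):

```cpp
#include <cstddef>
#ifdef __SSE3__
#include <pmmintrin.h>
#endif

// Scalar fallback, always available (e.g. ARM/Android builds).
inline void addToNaive(float* a, const float* b, size_t len) {
  for (size_t i = 0; i < len; ++i) a[i] += b[i];
}

inline void addTo(float* a, const float* b, size_t len) {
#ifdef __SSE3__
  size_t i = 0;
  for (; i + 4 <= len; i += 4) {  // 4 floats per 128-bit register
    __m128 va = _mm_loadu_ps(a + i);
    __m128 vb = _mm_loadu_ps(b + i);
    _mm_storeu_ps(a + i, _mm_add_ps(va, vb));
  }
  for (; i < len; ++i) a[i] += b[i];  // remainder
#else
  addToNaive(a, b, len);
#endif
}
```

This is the same shape as the new SIMDFunctions.h: the intrinsic header is only included when the target supports it, so the file compiles unchanged for Android targets.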
*/ #include "Storage.h" #include "Allocator.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" DEFINE_int32(pool_limit_size, @@ -62,7 +63,7 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { } if (gpuAllocator_[deviceId] == nullptr) { std::string name = - "gpu" + std::to_string(deviceId) + std::string("_pool"); + "gpu" + str::to_string(deviceId) + std::string("_pool"); gpuAllocator_[deviceId] = new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name); } diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp index 21918b86e1..22ce39701f 100644 --- a/paddle/math/tests/test_BaseMatrix.cpp +++ b/paddle/math/tests/test_BaseMatrix.cpp @@ -110,6 +110,8 @@ TEST(BaseMatrix, BaseMatrix) { compare(&BaseMatrix::addRowVector); compare(&BaseMatrix::mulRowVector); compare(&BaseMatrix::divRowVector); + compare(&BaseMatrix::mulColVector); + compare(&BaseMatrix::divColVector); compare(&BaseMatrix::addP2P); compare(&BaseMatrix::invSqrt); } diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 08b64c1bb6..dd19fe516f 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -685,7 +685,7 @@ TEST(SMatrix, topK) { } } -void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) { +void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) { MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); cpuInput->randomizeUniform(); @@ -706,15 +706,25 @@ void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) { gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode); TensorCheckErr(*cpuOutput, *gpuOutput); + + MatrixPtr cpuInGrad = std::make_shared(batchSize, inputDim); + MatrixPtr gpuInGrad = std::make_shared(batchSize, inputDim); + cpuInGrad->randomizeUniform(); + gpuInGrad->copyFrom(*cpuInGrad); + + cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode); + gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode); + + TensorCheckErr(*cpuInGrad, *gpuInGrad); } -TEST(Matrix, sequenceAvgForward) { +TEST(Matrix, sequenceAvg) { for (auto batchSize : {10, 128, 6000}) { for (auto inputDim : {32, 100, 512}) { for (auto mode : {0, 1, 2}) { VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim << " mode=" << mode; - testMatrixSequenceAvgForward(batchSize, inputDim, mode); + testMatrixSequenceAvg(batchSize, inputDim, mode); } } } diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 4139f59a2c..645bf73799 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -559,6 +559,49 @@ void Argument::degradeSequence(const Argument& input) { tgtBuf[numSequences] = numSubSequences; } +void Argument::poolSequenceWithStride(const Argument& input, + size_t stride, + IVectorPtr* stridePostions, + bool reversed) { + // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5, + // then sequenceStartPositions = [0, 2, 3, 4, 7]. 
+ // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30]; + // else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30] + + CHECK(input.sequenceStartPositions); + CHECK_EQ(input.hasSubseq(), 0UL); + CHECK_GT(stride, 0) << "stride must be larger than 0"; + size_t numSequences = input.getNumSequences(); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, numSequences + 1, false); + const int* starts = input.sequenceStartPositions->getData(false); + int* tgtBuf = sequenceStartPositions->getMutableData(false); + // first index of target sequence and stride positions are both 0 + tgtBuf[0] = 0; + std::vector<int> stridePos; + for (size_t seqId = 0; seqId < numSequences; ++seqId) { + size_t seqLength = starts[seqId + 1] - starts[seqId]; + stridePos.emplace_back(starts[seqId]); + if (seqLength == 0) { + // empty sequence + tgtBuf[seqId + 1] = tgtBuf[seqId]; + } else { + int size = ceil((float)seqLength / stride); + tgtBuf[seqId + 1] = tgtBuf[seqId] + size; + for (int i = 0; i < size - 1; ++i) { + int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride : stridePos.back() + stride; + stridePos.emplace_back(cur); + } + } + } + stridePos.emplace_back(starts[numSequences]); + int size = stridePos.size(); + CHECK_EQ(size - 1, tgtBuf[numSequences]); + IVector::resizeOrCreate(*stridePostions, size, false); + (*stridePostions)->copyFrom(stridePos.data(), size); +} + void Argument::getValueString( std::unordered_map* out) const { if (value) { diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 9fd84bc4b7..91aca98e18 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -291,6 +291,15 @@ struct Argument { */ void degradeSequence(const Argument& input); + /* + After pooling with stride n (n is smaller than sequence length), + a long sequence will be shortened. + This function is invalid for sequences that have sub-sequences. + */ + void poolSequenceWithStride(const Argument& input, + size_t stride, + IVectorPtr* stridePositions, + bool reversed = false); /** * @brief getValueString will return the argument's output in string. There * are several kinds of output. The keys of output dictionary are 'value', diff --git a/paddle/parameter/tests/CMakeLists.txt b/paddle/parameter/tests/CMakeLists.txt index cab264db8e..181ccdc1f0 100644 --- a/paddle/parameter/tests/CMakeLists.txt +++ b/paddle/parameter/tests/CMakeLists.txt @@ -1 +1,2 @@ add_simple_unittest(test_common) +add_simple_unittest(test_argument) diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp new file mode 100644 index 0000000000..81fe4ee397 --- /dev/null +++ b/paddle/parameter/tests/test_argument.cpp @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
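To make the windowing arithmetic concrete: a sequence [0, 9) with stride 5 yields ceil(9/5) = 2 windows. Anchored at the front, the window starts are {0, 5}; reversed anchors the windows at the sequence end instead, giving {0, 4}, so the first window is the short one. A standalone sketch of the per-sequence loop above (a hypothetical helper, same logic):

```cpp
#include <cmath>
#include <vector>

// Window starts for one sequence [begin, end) with the given stride.
// Forward windows grow from the sequence start; reversed windows are
// counted back from the sequence end.
std::vector<int> windowStarts(int begin, int end, int stride, bool reversed) {
  std::vector<int> pos{begin};
  int numWindows = static_cast<int>(std::ceil(float(end - begin) / stride));
  for (int i = 0; i < numWindows - 1; ++i) {
    pos.push_back(reversed ? end - (numWindows - 1 - i) * stride
                           : pos.back() + stride);
  }
  return pos;
}
// windowStarts(0, 9, 5, false) -> {0, 5}
// windowStarts(0, 9, 5, true)  -> {0, 4}
```

Applied to the documented example [0, 9, 14, 17, 30], this reproduces both expected position vectors, which is exactly what the new test_argument.cpp below checks.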
*/ + +#include +#include + +using namespace paddle; // NOLINT + +TEST(Argument, poolSequenceWithStride) { + Argument input, output; + ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false); + int* inStart = input.sequenceStartPositions->getMutableData(false); + inStart[0] = 0; + inStart[1] = 9; + inStart[2] = 14; + inStart[3] = 17; + inStart[4] = 30; + + int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30}; + int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30}; + + for (auto reversed : {false, true}) { + IVectorPtr stridePositions; + output.poolSequenceWithStride( + input, 5 /* stride */, &stridePositions, reversed); + + const int* outStart = output.sequenceStartPositions->getData(false); + CHECK_EQ(outStart[0], 0); + CHECK_EQ(outStart[1], 2); + CHECK_EQ(outStart[2], 3); + CHECK_EQ(outStart[3], 4); + CHECK_EQ(outStart[4], 7); + + CHECK_EQ(stridePositions->getSize(), 8); + auto result = reversed ? strideResultReversed : strideResult; + for (int i = 0; i < 8; i++) { + CHECK_EQ(stridePositions->getData()[i], result[i]); + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 877cbb86ec..19ff40ba7e 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/utils/Flags.h" #include "paddle/utils/GlobalConstants.h" #include "paddle/utils/Stat.h" +#include "paddle/utils/StringUtil.h" DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); DEFINE_double(async_lagged_ratio_min, @@ -218,7 +219,8 @@ void ParameterServer2::setConfig(const SetConfigRequest& request, callback(response); /// always defined, barrier slowest node function need it. 
- statSet_.reset(new StatSet("ParameterServer" + std::to_string(serverId_))); + statSet_.reset(new StatSet("ParameterServer" + + str::to_string(static_cast(serverId_)))); } real bufferSum(const std::vector& buffers) { diff --git a/paddle/py_paddle/.gitignore b/paddle/py_paddle/.gitignore index 9e8ad4bf16..80d1f76fbc 100644 --- a/paddle/py_paddle/.gitignore +++ b/paddle/py_paddle/.gitignore @@ -1 +1,2 @@ swig_paddle.py +_swig_paddle.so diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index 6d6a406cf6..7c6b835410 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -160,10 +160,19 @@ class SparseFloatScanner(SparseBinaryScanner): class IndexScanner(IScanner): def __init__(self, input_type, pos): IScanner.__init__(self, input_type, pos) - self.__ids__ = [] + self.__ids__ = None + self.__idx__ = 0 + + def pre_scan(self, dat): + self.__idx__ += 1 + + def finish_pre_scan(self, argument): + self.__ids__ = [0] * self.__idx__ + self.__idx__ = 0 def scan(self, dat): - self.__ids__.append(dat) + self.__ids__[self.__idx__] = dat + self.__idx__ += 1 def finish_scan(self, argument): ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu) @@ -178,6 +187,13 @@ class SequenceScanner(IScanner): self.__inner_scanner__ = inner_scanner self.__setter__ = setter + def pre_scan(self, dat): + for each in dat: + self.__inner_scanner__.pre_scan(each) + + def finish_pre_scan(self, argument): + self.__inner_scanner__.finish_pre_scan(argument) + def scan(self, dat): self.__seq__.append(self.__seq__[-1] + self.get_size(dat)) for each in dat: diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py index 1c9455fab5..3ae8dbf964 100644 --- a/paddle/py_paddle/util.py +++ b/paddle/py_paddle/util.py @@ -83,13 +83,17 @@ def __arguments_to_numpy__(i, arg): assert isinstance(arg, swig_paddle.Arguments) value = arg.getSlotValue(i) ids = arg.getSlotIds(i) + prob = arg.getSlotIn(i) if value is not None: assert isinstance(value, swig_paddle.Matrix) value = value.copyToNumpyMat() if ids is not None: assert isinstance(ids, swig_paddle.IVector) ids = ids.copyToNumpyArray() - return {"value": value, "id": ids} + if prob is not None: + assert isinstance(prob, swig_paddle.Matrix) + prob = prob.copyToNumpyMat() + return {"value": value, "id": ids, "prob": prob} def __monkeypatch_gradient_machine__(): diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index e5af5c9a1e..132f8cd8aa 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -94,7 +94,7 @@ docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host): ```bash -docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev +docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev ``` This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, could build the source code with possible local changes. When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed. 
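Back on the dataprovider_converter change above: IndexScanner now takes a counting pass (pre_scan), allocates its id buffer once (finish_pre_scan), and then fills it by index, instead of appending to a growing list on every scan call. The same two-pass preallocation idea expressed in C++ terms (an illustrative analogue, not code from this patch):

```cpp
#include <vector>

// Pass 1 only counts; one allocation; pass 2 writes by index. This avoids
// the repeated reallocation that element-by-element growth can incur.
std::vector<int> gatherIds(const std::vector<std::vector<int>>& batch) {
  size_t total = 0;
  for (const auto& seq : batch) total += seq.size();  // "pre_scan"
  std::vector<int> ids(total);                        // "finish_pre_scan"
  size_t idx = 0;
  for (const auto& seq : batch) {
    for (int id : seq) ids[idx++] = id;               // "scan"
  }
  return ids;
}
```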
@@ -108,7 +108,11 @@ This command mounts the source directory on the host into `/paddle` in the conta Users can specify the following Docker build arguments with either "ON" or "OFF" value: - `WITH_GPU`: ***Required***. Generates NVIDIA CUDA GPU code and relies on CUDA libraries. - `WITH_AVX`: ***Required***. Set to "OFF" prevents from generating AVX instructions. If you don't know what is AVX, you might want to set "ON". -- `TEST`: ***Optional, default OFF***. Build unit tests and run them after building. +- `WITH_TEST`: ***Optional, default OFF***. Build unit test binaries. Once you've built the unit tests, you can run them manually with the following command: + ```bash + docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall" + ``` +- `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. You can't run unit tests without building them first. ### Build the Production Docker Image @@ -125,7 +129,7 @@ This production image is minimal -- it includes binary `paddle`, the shared libr Again the development happens on the host. Suppose that we have a simple application program in `a.py`, we can test and run it using the production image: ```bash -docker run -it -v $PWD:/work paddle /work/a.py +docker run --rm -it -v $PWD:/work paddle /work/a.py ``` But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image and with more dependencies installs. @@ -162,3 +166,18 @@ docker tag myapp me/myapp docker push kubectl ... ``` + +### Reading source code with woboq codebrowser +For developers who are interested in the C++ source code, please use -e "WOBOQ=ON" to enable building the C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser). + +- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host: + +```bash +docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddle:dev +``` + +- You can open the generated HTML files in your Web browser.
Or, if you want to run a Nginx container to serve them for a wider audience, you can run: + +``` +docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx +``` diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 405d3338af..a4b63f90ec 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -4,19 +4,21 @@ set -e # Set BASE_IMAGE according to env variables if [ ${WITH_GPU} == "ON" ]; then - BASE_IMAGE="nvidia/cuda:7.5-cudnn5-runtime-ubuntu14.04" + BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu14.04" # additional packages to install when building gpu images GPU_DOCKER_PKG="python-pip python-dev" else BASE_IMAGE="python:2.7.13-slim" + # FIXME: python base image uses different python version than WITH_GPU + # need to change PYTHONHOME to /usr/local when using python base image + CPU_DOCKER_PYTHON_HOME_ENV="ENV PYTHONHOME /usr/local" fi DOCKERFILE_GPU_ENV="" +DOCKERFILE_CUDNN_DSO="" if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - - # for cmake to find cudnn - ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so + DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so" fi mkdir -p /paddle/build @@ -33,10 +35,10 @@ cmake .. \ -DWITH_SWIG_PY=ON \ -DCUDNN_ROOT=/usr/ \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \ - -DON_COVERALLS=${TEST:-OFF} \ + -DON_COVERALLS=${WITH_TEST:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON make -j `nproc` -if [[ ${TEST:-OFF} == "ON" ]]; then +if [[ ${RUN_TEST:-OFF} == "ON" ]]; then make coveralls fi make install @@ -47,7 +49,7 @@ make install # install them in docker cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" .. -if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then +if [[ ${WOBOQ:-OFF} == 'ON' ]]; then apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev # Install woboq_codebrowser. git clone https://github.com/woboq/woboq_codebrowser /woboq @@ -57,7 +59,7 @@ if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then . 
make - export WOBOQ_OUT=/usr/share/nginx/html/paddle + export WOBOQ_OUT=/woboq_out/paddle export BUILD_DIR=/paddle/build mkdir -p $WOBOQ_OUT cp -rv /woboq/data $WOBOQ_OUT/../data @@ -95,7 +97,11 @@ RUN ${MIRROR_UPDATE} # Use different deb file when building different type of images ADD build/*.deb /usr/local/opt/paddle/deb/ # run paddle version to install python packages first -RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && rm -f /usr/local/opt/paddle/deb/*.deb && paddle version +RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && \ + rm -f /usr/local/opt/paddle/deb/*.deb && \ + paddle version +${CPU_DOCKER_PYTHON_HOME_ENV} +${DOCKERFILE_CUDNN_DSO} ${DOCKERFILE_GPU_ENV} # default command shows the paddle version and exit CMD ["paddle", "version"] diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index 5a45df4072..8fba4a19ba 100644 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -21,9 +21,7 @@ function version(){ echo " with_double: @WITH_DOUBLE@" echo " with_python: @WITH_PYTHON@" echo " with_rdma: @WITH_RDMA@" - echo " with_metric_learning: @WITH_METRIC@" echo " with_timer: @WITH_TIMER@" - echo " with_predict_sdk: @WITH_PREDICT_SDK@" } function ver2num() { diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index 7deb3e62e8..f2cbc56165 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -5,7 +5,7 @@ NPROC=1 export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages export PYTHONHOME=/opt/python/2.7.12 export PATH=/opt/python/2.7.12/bin:${PATH} -cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} +cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DWITH_COVERAGE=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} NRPOC=`nproc` make -j $NPROC make coveralls diff --git a/paddle/setup.py.in b/paddle/setup.py.in index 382d5be6ec..0b62436a7f 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -12,68 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This file is used to build paddle python binding package. -# It will be invoked by Makefile that generated by COMAKE from setuptools import setup, Extension -import numpy as np -import api.paddle_ld_flags -import platform -import os - -system = platform.system().lower() - -is_osx = (system == 'darwin') -is_win = (system == 'windows') -is_lin = (system == 'linux') - - -# The extra links will passed from COMAKE -# because generate paddle LDFLAGS is too complicated to do in setup.py -# it just read COMAKE generated LDFLAGS. -extra_comps = [] -extra_links = [] -obj = api.paddle_ld_flags.PaddleLDFlag() -extra_comps = obj.c_flag() -ldflags = obj.ldflag_str() -if ldflags is not None: - extra_links.extend(ldflags.split(" ")) - -try: - with open('.py_paddle_extra_link_flags', 'r') as f: - for line in f: - extra_links += line.split() -except: - pass - -if is_lin == True: - extra_links = ["-Xlinker", '-start-group'] + extra_links + ["-Xlinker", "-end-group"] -elif is_osx == True: - os.environ["ARCHFLAGS"] = "-arch x86_64" - extra_links = ["-Wl,-all_load"] + extra_links - -include_dirs = [np.get_include(), "../"] # include numpy and paddle. 
- -os.environ["CC"] = "@CMAKE_C_COMPILER@" -os.environ["CXX"] = "@CMAKE_CXX_COMPILER@" - setup(name="py_paddle", - version="@PADDLE_VERSION@", - ext_modules=[ - Extension('py_paddle._swig_paddle', # Build SWIG Extension. - ['Paddle_wrap.cxx'], - language = "c++", - include_dirs = include_dirs, - extra_link_args = extra_links, - extra_compile_args = extra_comps - ) - ], - packages=['py_paddle'], - include_dirs = include_dirs, - install_requires = [ - 'nltk>=3.2.2', - 'numpy>=1.8.0', # The numpy is required. - 'protobuf>=3.0.0' # The paddle protobuf version - ], + version="${PADDLE_VERSION}", + packages=['py_paddle'], + include_package_data=True, + package_data={'py_paddle':['*.py','_swig_paddle.so']}, + install_requires = [ + 'nltk>=3.2.2', + 'numpy>=1.8.0', # The numpy is required. + 'protobuf>=${PROTOBUF_VERSION}' # The paddle protobuf version + ], + url='http://www.paddlepaddle.org/', + license='Apache 2.0', ) diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore index 956b606a18..f2cfd74094 100644 --- a/paddle/utils/.gitignore +++ b/paddle/utils/.gitignore @@ -1,2 +1 @@ enable_virtualenv.c -PythonUtil.cpp diff --git a/paddle/utils/Any.h b/paddle/utils/Any.h new file mode 100644 index 0000000000..99a0139acc --- /dev/null +++ b/paddle/utils/Any.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#if __cplusplus > 201402L +#include + +namespace paddle { +// using std::any for C++ 17 +using std::any; +using std::any_cast; +using std::bad_any_cast; +} // namespace paddle + +#else +#include + +namespace paddle { +// use linb::any for C++ 11 +using linb::any; +using linb::any_cast; +using linb::bad_any_cast; +} // namespace paddle +#endif diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 10d906ee16..171eae381a 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -1,7 +1,4 @@ # The utilities for paddle - -configure_file(PythonUtil.cpp.in ${PROJ_ROOT}/paddle/utils/PythonUtil.cpp) - file(GLOB UTIL_HEADERS . *.h) file(GLOB UTIL_SOURCES . *.cpp) create_resources(enable_virtualenv.py enable_virtualenv.c) diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp index 8eefdd2980..edd33c4541 100644 --- a/paddle/utils/CpuId.cpp +++ b/paddle/utils/CpuId.cpp @@ -19,7 +19,7 @@ limitations under the License. */ /// for MSVC #define CPUID(info, x) __cpuidex(info, x, 0) -#else +#elif !defined(__ANDROID__) #include @@ -31,6 +31,7 @@ limitations under the License. */ namespace paddle { SIMDFlags::SIMDFlags() { +#if !defined(__ANDROID__) unsigned int cpuInfo[4]; // CPUID: https://en.wikipedia.org/wiki/CPUID // clang-format off @@ -51,6 +52,9 @@ SIMDFlags::SIMDFlags() { CPUID(cpuInfo, 0x80000001); simd_flags_ |= cpuInfo[2] & (1 << 16) ? 
SIMD_FMA4 : SIMD_NONE; // clang-fotmat on +#else + simd_flags_ = SIMD_NEON; +#endif } SIMDFlags const* SIMDFlags::instance() { diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h index 5fc610964d..869be5be54 100644 --- a/paddle/utils/CpuId.h +++ b/paddle/utils/CpuId.h @@ -30,6 +30,7 @@ enum simd_t { SIMD_AVX = 1 << 8, ///< AVX SIMD_AVX2 = 1 << 9, ///< AVX 2 SIMD_AVX512 = 1 << 10, ///< AVX 512 + SIMD_NEON = 1 << 11, ///< NEON }; // clang-format on @@ -96,6 +97,7 @@ private: #define HAS_AVX HAS_SIMD(SIMD_AVX) #define HAS_AVX2 HAS_SIMD(SIMD_AVX2) #define HAS_AVX512 HAS_SIMD(SIMD_AVX512) +#define HAS_NEON HAS_SIMD(SIMD_NEON) // clang-format on /** diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp index 5a1c6ecb22..ea96bad240 100644 --- a/paddle/utils/Logging.cpp +++ b/paddle/utils/Logging.cpp @@ -18,6 +18,7 @@ limitations under the License. */ */ #include "Logging.h" +#include namespace paddle { diff --git a/paddle/utils/PythonUtil.cpp.in b/paddle/utils/PythonUtil.cpp similarity index 98% rename from paddle/utils/PythonUtil.cpp.in rename to paddle/utils/PythonUtil.cpp index 66b5795e29..7faeff55c2 100644 --- a/paddle/utils/PythonUtil.cpp.in +++ b/paddle/utils/PythonUtil.cpp @@ -195,10 +195,6 @@ extern const char enable_virtualenv_py[]; } void initPython(int argc, char** argv) { #ifndef PADDLE_NO_PYTHON - char pyHome[] = "@PYTHON_INSTALL_DIR@"; // NOLINT - if (strlen(pyHome)) { - Py_SetPythonHome(pyHome); - } Py_SetProgramName(argv[0]); Py_Initialize(); PySys_SetArgv(argc, argv); diff --git a/paddle/utils/StringUtil.h b/paddle/utils/StringUtil.h index 0b4f4c9113..95f071cb7d 100644 --- a/paddle/utils/StringUtil.h +++ b/paddle/utils/StringUtil.h @@ -54,6 +54,25 @@ inline T toWithStatus(const std::string& s, bool* ok = nullptr) { return v; } +/** + * Cast type T to string with status. + * + * @param [in] v input value of type T. + * @param [out] ok status, return true if there is no error in casting. Pass + * nullptr if the caller does not care about errors. + * @return result of casting. If an error occurred, an empty string will be + * returned. + */ +template <class T> +inline std::string toWithStatus(const T v, bool* ok = nullptr) { + std::ostringstream sout; + sout << v; + if (ok) { + *ok = !sout.fail(); + } + return sout.str(); +} + /// Convert string to type T. It makes sure all the characters in s are used. /// Otherwise it will abort. /// @@ -67,6 +86,18 @@ inline T to(const std::string& s) { return v; } +/// Convert type T to string. +/// +/// @tparam T type of input value +/// @param v input value of type T +template <class T> +std::string to_string(T v) { + bool ok; + std::string s = toWithStatus(v, &ok); + CHECK(ok) << "Cannot convert v(" << v << ") to type std::string"; + return s; +} + } // namespace str #undef DEFINE_STRING_CONVERSION diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index 1f56b6b8a9..b18b73e06a 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -15,11 +15,16 @@ limitations under the License.
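The str::to_string added above streams the value through an std::ostringstream rather than calling std::to_string. A plausible motivation, given the Android cross-compilation work elsewhere in this change, is that some toolchains ship a standard library without std::to_string, which would explain why Storage.cpp and ParameterServer2.cpp switch to the wrapper. Reduced to its essentials (illustrative, minus the CHECK on the status flag):

```cpp
#include <sstream>
#include <string>

// Portable std::to_string substitute: any streamable T works, and the
// optional flag reports stream failure instead of throwing.
template <class T>
std::string toString(const T& v, bool* ok = nullptr) {
  std::ostringstream sout;
  sout << v;  // relies on operator<< for T
  if (ok) *ok = !sout.fail();
  return sout.str();
}
```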
 */
 
 #include "Util.h"
 
 #include
-#include <pmmintrin.h>
 #include
 #include
 #include
+
+#ifdef __SSE__
 #include <xmmintrin.h>
+#endif
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
 
 #include
 #include
@@ -163,8 +168,12 @@ void initMain(int argc, char** argv) {
 
   installProfilerSwitch();
 
+#ifdef __SSE__
   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+#endif
+#ifdef __SSE3__
   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#endif
 
   if (FLAGS_seed == 0) {
     unsigned int t = time(NULL);
diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp
index 2a6f96e04d..310c9a6542 100644
--- a/paddle/utils/arch/linux/Locks.cpp
+++ b/paddle/utils/arch/linux/Locks.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/utils/Locks.h"
 #include <semaphore.h>
 #include <unistd.h>
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 class SemaphorePrivate {
@@ -26,7 +27,10 @@ Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
   sem_init(&m->sem, 0, initValue);
 }
 
-Semaphore::~Semaphore() { sem_destroy(&m->sem); }
+Semaphore::~Semaphore() {
+  sem_destroy(&m->sem);
+  delete m;
+}
 
 bool Semaphore::timeWait(struct timespec* ts) {
   return (0 == sem_timedwait(&m->sem, ts));
@@ -36,36 +40,101 @@ void Semaphore::wait() { sem_wait(&m->sem); }
 
 void Semaphore::post() { sem_post(&m->sem); }
 
+#ifdef PADDLE_USE_PTHREAD_SPINLOCK
+
 class SpinLockPrivate {
 public:
   inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
   inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
+
+  inline void lock() { pthread_spin_lock(&lock_); }
+  inline void unlock() { pthread_spin_unlock(&lock_); }
+
   pthread_spinlock_t lock_;
   char padding_[64 - sizeof(pthread_spinlock_t)];
 };
 
-SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
+#else
 
-SpinLock::~SpinLock() { delete m; }
+#include <atomic>
+class SpinLockPrivate {
+public:
+  inline void lock() {
+    while (lock_.test_and_set(std::memory_order_acquire)) {
+    }
+  }
+  inline void unlock() { lock_.clear(std::memory_order_release); }
+
+  std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
+  char padding_[64 - sizeof(lock_)];  // Padding to cache line size
+};
 
-void SpinLock::lock() { pthread_spin_lock(&m->lock_); }
+#endif
 
-void SpinLock::unlock() { pthread_spin_unlock(&m->lock_); }
+SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
+SpinLock::~SpinLock() { delete m; }
+void SpinLock::lock() { m->lock(); }
+void SpinLock::unlock() { m->unlock(); }
+
+#ifdef PADDLE_USE_PTHREAD_BARRIER
 
 class ThreadBarrierPrivate {
 public:
   pthread_barrier_t barrier_;
+
+  inline explicit ThreadBarrierPrivate(int count) {
+    pthread_barrier_init(&barrier_, nullptr, count);
+  }
+
+  inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); }
+
+  inline void wait() { pthread_barrier_wait(&barrier_); }
 };
 
-ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate()) {
-  pthread_barrier_init(&m->barrier_, nullptr, count);
-}
+#else
 
-ThreadBarrier::~ThreadBarrier() {
-  pthread_barrier_destroy(&m->barrier_);
-  delete m;
-}
+class ThreadBarrierPrivate {
+public:
+  pthread_mutex_t mutex_;
+  pthread_cond_t cond_;
+  int count_;
+  int tripCount_;
+
+  inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
+    CHECK_NE(cnt, 0);
+    CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
+    CHECK_GE(pthread_cond_init(&cond_, 0), 0);
+  }
+
+  inline ~ThreadBarrierPrivate() {
+    pthread_cond_destroy(&cond_);
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  /**
+   * @brief wait
+   * @return true if the last wait
+   */
+  inline bool wait() {
+    pthread_mutex_lock(&mutex_);
+    ++count_;
+    if (count_ >= tripCount_) {
+      count_ = 0;
+      pthread_cond_broadcast(&cond_);
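+      // The last thread to arrive resets count_ and wakes every waiter, so
+      // the barrier is immediately reusable for the next iteration.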
+ pthread_mutex_unlock(&mutex_); + return true; + } else { + pthread_cond_wait(&cond_, &mutex_); + pthread_mutex_unlock(&mutex_); + return false; + } + } +}; + +#endif -void ThreadBarrier::wait() { pthread_barrier_wait(&m->barrier_); } +ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} +ThreadBarrier::~ThreadBarrier() { delete m; } +void ThreadBarrier::wait() { m->wait(); } } // namespace paddle diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp index 378788bcec..b5d9f93f13 100644 --- a/paddle/utils/tests/test_CustomStackTrace.cpp +++ b/paddle/utils/tests/test_CustomStackTrace.cpp @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/utils/CustomStackTrace.h" #include "paddle/utils/Locks.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" DEFINE_int32(test_thread_num, 10, "testing thread number"); @@ -69,11 +70,11 @@ TEST(CustomStackTrace, normalTrain) { while (countDown-- > 0) { start.wait(); for (size_t i = 0; i < layerSize; ++i) { - tracer.push("layer_" + std::to_string(i)); + tracer.push("layer_" + paddle::str::to_string(i)); } tracer.pop(""); for (size_t i = 0; i < layerSize; ++i) { - tracer.pop("layer_" + std::to_string(layerSize - 1 - i)); + tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i)); } finish.wait(); } @@ -89,7 +90,7 @@ TEST(CustomStackTrace, normalTest) { while (countDown-- > 0) { start.wait(); for (size_t i = 0; i < layerSize; ++i) { - tracer.push("layer_" + std::to_string(i)); + tracer.push("layer_" + paddle::str::to_string(i)); } tracer.clear(); // in forward test, tracer will clear after forward. finish.wait(); diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp index 611b16aa71..360c61c88a 100644 --- a/paddle/utils/tests/test_CustomStackTracePrint.cpp +++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/utils/CustomStackTrace.h" +#include "paddle/utils/StringUtil.h" #include "paddle/utils/Util.h" int main(int argc, char** argv) { paddle::initMain(argc, argv); for (size_t i = 0; i < 1000; ++i) { - paddle::gLayerStackTrace.push("layer_" + std::to_string(i)); + paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i)); if (i == 998) { throw "Unhandle exception"; } diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp index 8200a24ce7..185789c927 100644 --- a/paddle/utils/tests/test_SIMDFlags.cpp +++ b/paddle/utils/tests/test_SIMDFlags.cpp @@ -18,7 +18,8 @@ limitations under the License. 
*/ using namespace paddle; // NOLINT TEST(SIMDFlags, gccTest) { -#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) +#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \ + !defined(__arm__) // clang-format off CHECK(!__builtin_cpu_supports("sse") != HAS_SSE); CHECK(!__builtin_cpu_supports("sse2") != HAS_SSE2); @@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) { LOG(INFO) << "Has AVX: " << std::boolalpha << HAS_AVX; LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2; LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512; + LOG(INFO) << "Has NEON: " << std::boolalpha << HAS_NEON; } diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 65d5d50277..4f9b53d6f6 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -441,6 +441,11 @@ message LayerConfig { // blank label used in ctc loss optional uint32 blank = 52 [default = 0]; + + // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which + // controls the scope of pooling operation. can be set > 0. + // leave empty or set to -1 to disable this stride pooling. + optional int32 seq_pool_stride = 53 [default = -1]; } message EvaluatorConfig { diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 48e0a1993d..e7a0895533 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -24,8 +24,9 @@ add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) add_subdirectory(paddle/trainer_config_helpers/tests) -add_subdirectory(paddle/v2/reader/tests) add_subdirectory(paddle/v2/tests) +add_subdirectory(paddle/v2/reader/tests) +add_subdirectory(paddle/v2/plot/tests) install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/ DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index e9f5d53678..dc89419c40 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -724,7 +724,8 @@ class ConvProjection(ConvBaseProjection): num_filters=None, conv_conf=None, **xargs): - super(ConvProjection, self).__init__(input_layer_name, **xargs) + super(ConvProjection, self).__init__(input_layer_name, num_filters, + conv_conf, **xargs) parse_conv(conv_conf, self.input_layer_name, self.proj_conf.conv_conf, num_filters) @@ -742,7 +743,8 @@ class ConvTransProjection(ConvBaseProjection): num_filters=None, conv_conf=None, **xargs): - super(ConvTransProjection, self).__init__(input_layer_name, **xargs) + super(ConvTransProjection, self).__init__(input_layer_name, num_filters, + conv_conf, **xargs) parse_conv( conv_conf, @@ -1218,9 +1220,11 @@ def parse_image(image, input_layer_name, image_conf): def parse_norm(norm, input_layer_name, norm_conf): norm_conf.norm_type = norm.norm_type - config_assert(norm.norm_type in ['rnorm', 'cmrnorm-projection'], - "norm-type %s is not in [rnorm, 'cmrnorm-projection']" % - norm.norm_type) + config_assert( + norm.norm_type in + ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'], + "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]" + % norm.norm_type) norm_conf.channels = norm.channels norm_conf.size = norm.size norm_conf.scale = norm.scale @@ -1896,6 +1900,9 @@ class NormLayer(LayerBase): norm_conf) self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x, norm_conf.channels, False) + if norm_conf.norm_type == "cross-channel-norm": + self.create_input_parameter(0, norm_conf.channels, + [norm_conf.channels, 1]) @config_layer('pool') @@ -2478,6 +2485,7 @@ class 
SequenceLastInstanceLayer(LayerBase):
                  active_type='linear',
                  trans_type='non-seq',
                  bias=False,
+                 stride=-1,
                  **xargs):
         super(SequenceLastInstanceLayer, self).__init__(
             name,
@@ -2488,10 +2496,11 @@ class SequenceLastInstanceLayer(LayerBase):
             **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input')
+        if trans_type == 'seq':
+            config_assert(stride == -1, 'subseq does not support stride window')
         self.config.trans_type = trans_type
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
+        self.config.seq_pool_stride = stride
+        self.set_layer_size(self.get_input_layer(0).size)
         self.create_bias_parameter(bias, self.config.size)
 
 
@@ -2503,10 +2512,16 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
                  active_type='linear',
                  trans_type='non-seq',
                  bias=False,
+                 stride=-1,
                  **xargs):
         super(SequenceFirstInstanceLayer, self).__init__(
-            name, inputs=inputs, active_type=active_type, bias=bias, **xargs)
-        self.config.trans_type = trans_type
+            name,
+            inputs=inputs,
+            active_type=active_type,
+            trans_type=trans_type,
+            bias=bias,
+            stride=stride,
+            **xargs)
         self.config.select_first = True
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b006eb46d9..97db3c2d4c 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -18,7 +18,7 @@ import inspect
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
-    ReluActivation, IdentityActivation, SoftmaxActivation
+    ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
 from .poolings import MaxPooling, AvgPooling, BasePoolingType
 from .attrs import *
@@ -112,6 +112,7 @@ __all__ = [
     'out_prod_layer',
     'print_layer',
     'priorbox_layer',
+    'cross_channel_norm_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -1008,6 +1009,46 @@ def priorbox_layer(input,
         size=size)
 
 
+@wrap_name_default("cross_channel_norm")
+def cross_channel_norm_layer(input, name=None, param_attr=None):
+    """
+    Normalize a layer's output. This layer is necessary for SSD. It applies
+    normalization across the channels of each sample on a conv layer's
+    output, and scales the output by a group of trainable factors whose
+    dimension equals the number of channels.
+
+    :param name: The layer name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param param_attr: The Parameter Attribute|list.
+    :type param_attr: ParameterAttribute
+    :return: LayerOutput
+    """
+    assert input.num_filters is not None
+    Layer(
+        name=name,
+        type=LayerType.NORM_LAYER,
+        inputs=[
+            Input(
+                input.name,
+                norm=Norm(
+                    norm_type="cross-channel-norm",
+                    channels=input.num_filters,
+                    size=input.size,
+                    scale=0,
+                    pow=0,
+                    blocked=0),
+                **param_attr.attr)
+        ])
+    return LayerOutput(
+        name,
+        LayerType.NORM_LAYER,
+        parents=input,
+        num_filters=input.num_filters,
+        size=input.size)
+
+
 @wrap_name_default("seq_pooling")
 @wrap_bias_attr_default(has_bias=False)
 @wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling())
@@ -1301,10 +1342,16 @@ def grumemory(input,
 def last_seq(input,
              name=None,
              agg_level=AggregateLevel.EACH_TIMESTEP,
+             stride=-1,
             layer_attr=None):
     """
     Get Last Timestamp Activation of a sequence.
+    If stride > 0, this layer slides a window whose size is determined by
+    stride, and returns the last value of the window as the output. Thus, a
+    long sequence will be shortened. Note that for a sequence with
+    sub-sequences, the default value of stride is -1.
+
     The simple usage is:
 
     .. code-block:: python
 
@@ -1316,6 +1363,8 @@ def last_seq(input,
     :type name: basestring
     :param input: Input layer name.
     :type input: LayerOutput
+    :param stride: window size.
+    :type stride: int
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
@@ -1327,11 +1376,15 @@ def last_seq(input,
                  " series information at all. Maybe you want to use"
                  " first_seq instead.")
 
+    if agg_level == AggregateLevel.EACH_SEQUENCE:
+        assert stride == -1
+
     Layer(
         name=name,
         type=LayerType.SEQUENCE_LAST_INSTANCE,
         inputs=[input.name],
         trans_type=agg_level,
+        stride=stride,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -1345,10 +1398,16 @@ def last_seq(input,
 def first_seq(input,
               name=None,
               agg_level=AggregateLevel.EACH_TIMESTEP,
+              stride=-1,
               layer_attr=None):
     """
     Get First Timestamp Activation of a sequence.
 
+    If stride > 0, this layer slides a window whose size is determined by
+    stride, and returns the first value of the window as the output. Thus, a
+    long sequence will be shortened. Note that for a sequence with
+    sub-sequences, the default value of stride is -1.
+
     The simple usage is:
 
     .. code-block:: python
 
@@ -1360,6 +1419,8 @@ def first_seq(input,
     :type name: basestring
     :param input: Input layer name.
     :type input: LayerOutput
+    :param stride: window size.
+    :type stride: int
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
@@ -1372,11 +1433,15 @@ def first_seq(input,
                  ' time series information at all. Maybe you want to use'
                  ' last_seq instead.')
 
+    if agg_level == AggregateLevel.EACH_SEQUENCE:
+        assert stride == -1
+
     Layer(
         name=name,
         type=LayerType.SEQUENCE_FIRST_INSTANCE,
         inputs=[input.name],
         trans_type=agg_level,
+        stride=stride,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -1875,7 +1940,7 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
 @layer_support()
 def hsigmoid(input,
              label,
-             num_classes,
+             num_classes=None,
              name=None,
             bias_attr=None,
             param_attr=None,
@@ -1891,8 +1956,7 @@ def hsigmoid(input,
     .. code-block:: python
 
         cost = hsigmoid(input=[layer1, layer2],
-                        label=data_layer,
-                        num_classes=3)
+                        label=data_layer)
 
     :param input: Input layers. It could be a LayerOutput or list/tuple of
                   LayerOutput.
@@ -1900,12 +1964,14 @@ def hsigmoid(input,
     :param label: Label layer.
     :type label: LayerOutput
     :param num_classes: number of classes.
-    :type num_classes: int
+    :type num_classes: int|None
     :param name: layer name
     :type name: basestring
     :param bias_attr: Bias attribute. None means default bias.
                       False means no bias.
     :type bias_attr: ParameterAttribute|False
+    :param param_attr: Parameter Attribute. None means default parameter.
+    :type param_attr: ParameterAttribute|None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
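(For reference, a minimal sketch of the new stride window, modeled on the updated last_first_seq.py test config further below; the `data` layer size is illustrative, and stride pooling is only defined for AggregateLevel.EACH_TIMESTEP.)

.. code-block:: python

    din = data_layer(name='data', size=30)

    # Default behaviour: reduce each sequence to its single last timestep.
    last = last_seq(input=din)

    # With stride=5, a window of 5 timesteps slides over the sequence and the
    # last value of every window is emitted, so a long sequence is shortened
    # instead of being collapsed to one vector.
    last_strided = last_seq(input=din,
                            agg_level=AggregateLevel.EACH_TIMESTEP,
                            stride=5)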
@@ -1925,6 +1991,11 @@ def hsigmoid(input,
     assert isinstance(label, LayerOutput)
     assert label.layer_type == LayerType.DATA
 
+    if num_classes is None:
+        num_classes = label.size
+    if num_classes is None or num_classes <= 2:
+        raise ValueError("hsigmoid label size must be larger than 2.")
+
     ipts_for_layer = []
     parents = []
     for each_input, each_param_attr in zip(input, param_attr):
@@ -2212,8 +2283,9 @@ def img_pool_layer(input,
         pool_type.name = 'avg'
 
     type_name = pool_type.name + '-projection' \
-        if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
-        else pool_type.name
+        if (
+        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
+        else pool_type.name
 
     pool_size_y = pool_size if pool_size_y is None else pool_size_y
     stride_y = stride if stride_y is None else stride_y
@@ -3253,8 +3325,8 @@ def recurrent_group(step,
     assert (targetInlink == None or targetInlink_in_inlinks())
     targetInlinkName = None if targetInlink == None \
-        else targetInlink.name if isinstance(targetInlink, LayerOutput) \
-        else targetInlink.input.name
+                       else targetInlink.name if isinstance(targetInlink, LayerOutput) \
+                       else targetInlink.input.name
 
     contains_sub_seq = [False]
@@ -4766,12 +4838,14 @@ def crf_decoding_layer(input,
     return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)
 
 
+@wrap_act_default(act=SigmoidActivation())
 @wrap_bias_attr_default(has_bias=True)
 @wrap_name_default()
 @layer_support()
 def nce_layer(input,
               label,
               num_classes,
+              act=None,
               weight=None,
               num_neg_samples=10,
               neg_distribution=None,
@@ -4800,6 +4874,8 @@ def nce_layer(input,
     :type weight: LayerOutput
     :param num_classes: number of classes.
     :type num_classes: int
+    :param act: Activation; the default is Sigmoid.
+    :type act: BaseActivation
     :param num_neg_samples: number of negative samples. Default is 10.
     :type num_neg_samples: int
     :param neg_distribution: The distribution for generating the random negative labels.
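(A sketch of the new `act` argument; the layer names and sizes here are illustrative, not part of the patch. `neg_distribution`, when given, must hold `num_classes` values summing to 1.)

.. code-block:: python

    din = data_layer(name='feature', size=100)
    lbl = data_layer(name='word', size=10000)
    hidden = fc_layer(input=din, size=128)

    # act defaults to SigmoidActivation(); any other BaseActivation instance
    # may be passed, and the returned LayerOutput records it.
    cost = nce_layer(input=hidden,
                     label=lbl,
                     num_classes=10000,
                     num_neg_samples=10)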
@@ -4821,7 +4897,9 @@ def nce_layer(input, if neg_distribution is not None: assert isinstance(neg_distribution, collections.Sequence) assert len(neg_distribution) == num_classes - assert sum(neg_distribution) == 1 + assert abs(sum(neg_distribution) - 1.0) < 1e-5 + if not isinstance(act, BaseActivation): + raise TypeError() ipts_for_layer = [] parents = [] @@ -4843,12 +4921,17 @@ def nce_layer(input, type=LayerType.NCE_LAYER, num_classes=num_classes, neg_sampling_dist=neg_distribution, + active_type=act.name, num_neg_samples=num_neg_samples, inputs=ipts_for_layer, bias=ParamAttr.to_bias(bias_attr), **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( - name, LayerType.NCE_LAYER, parents=parents, size=l.config.size) + name, + LayerType.NCE_LAYER, + parents=parents, + size=l.config.size, + activation=act) """ diff --git a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py index 3a1a0132b6..3c6dbc95e5 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py +++ b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py @@ -14,4 +14,7 @@ for op in seq_op: for al in agg_level: opts.append(op(input=din, agg_level=al)) +for op in seq_op: + opts.append(op(input=din, agg_level=AggregateLevel.EACH_TIMESTEP, stride=5)) + outputs(opts) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr index 7b2911f8e3..12b2255f3a 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr @@ -15,6 +15,7 @@ layers { } select_first: true trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__first_seq_1__" @@ -26,6 +27,7 @@ layers { } select_first: true trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__last_seq_0__" @@ -36,6 +38,7 @@ layers { input_layer_name: "data" } trans_type: "seq" + seq_pool_stride: -1 } layers { name: "__last_seq_1__" @@ -46,12 +49,38 @@ layers { input_layer_name: "data" } trans_type: "non-seq" + seq_pool_stride: -1 +} +layers { + name: "__first_seq_2__" + type: "seqlastins" + size: 30 + active_type: "linear" + inputs { + input_layer_name: "data" + } + select_first: true + trans_type: "non-seq" + seq_pool_stride: 5 +} +layers { + name: "__last_seq_2__" + type: "seqlastins" + size: 30 + active_type: "linear" + inputs { + input_layer_name: "data" + } + trans_type: "non-seq" + seq_pool_stride: 5 } input_layer_names: "data" output_layer_names: "__first_seq_0__" output_layer_names: "__first_seq_1__" output_layer_names: "__last_seq_0__" output_layer_names: "__last_seq_1__" +output_layer_names: "__first_seq_2__" +output_layer_names: "__last_seq_2__" sub_models { name: "root" layer_names: "data" @@ -59,11 +88,15 @@ sub_models { layer_names: "__first_seq_1__" layer_names: "__last_seq_0__" layer_names: "__last_seq_1__" + layer_names: "__first_seq_2__" + layer_names: "__last_seq_2__" input_layer_names: "data" output_layer_names: "__first_seq_0__" output_layer_names: "__first_seq_1__" output_layer_names: "__last_seq_0__" output_layer_names: "__last_seq_1__" + output_layer_names: "__first_seq_2__" + output_layer_names: "__last_seq_2__" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr 
b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr index fed3790043..2afc3afef6 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr @@ -156,6 +156,7 @@ layers { } inputs { input_layer_name: "img" + input_parameter_name: "___mixed_6__.w1" proj_conf { type: "conv" name: "___mixed_6__.w1" @@ -177,6 +178,7 @@ layers { output_y: 30 img_size_y: 32 } + num_filters: 64 } } inputs { @@ -218,6 +220,7 @@ layers { } inputs { input_layer_name: "img" + input_parameter_name: "___mixed_7__.w1" proj_conf { type: "convt" name: "___mixed_7__.w1" @@ -239,6 +242,7 @@ layers { output_y: 32 img_size_y: 63 } + num_filters: 64 } } inputs { @@ -377,6 +381,22 @@ parameters { initial_strategy: 0 initial_smart: true } +parameters { + name: "___mixed_6__.w1" + size: 576 + initial_mean: 0.0 + initial_std: 0.471404520791 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___mixed_7__.w1" + size: 576 + initial_mean: 0.0 + initial_std: 0.471404520791 + initial_strategy: 0 + initial_smart: false +} parameters { name: "___mixed_8__.w0" size: 30000 diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr index b6905824f0..64530146a1 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr @@ -128,6 +128,7 @@ layers { input_layer_name: "__simple_gru_0__" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__last_seq_1__" @@ -138,6 +139,7 @@ layers { input_layer_name: "__simple_gru_1__" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__fc_layer_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr index 0a83499b72..79fa4c74f0 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr @@ -210,6 +210,7 @@ layers { input_layer_name: "__lstm_group_0__" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__last_seq_1__" @@ -220,6 +221,7 @@ layers { input_layer_name: "__lstm_group_1__" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__fc_layer_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr index dacb40185f..68fa881b4f 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr @@ -143,6 +143,7 @@ layers { input_layer_name: "__recurrent_layer_0__" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__first_seq_0__" @@ -154,6 +155,7 @@ layers { } select_first: true trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__last_seq_1__" @@ -164,6 +166,7 @@ layers { input_layer_name: "__lstmemory_0__" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__first_seq_1__" @@ -175,6 +178,7 @@ layers { } select_first: true trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__last_seq_2__" @@ -185,6 +189,7 @@ layers 
{ input_layer_name: "__gru_0__" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__first_seq_2__" @@ -196,6 +201,7 @@ layers { } select_first: true trans_type: "non-seq" + seq_pool_stride: -1 } parameters { name: "___fc_layer_0__.w0" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr index a0fb729e06..77b447aa9d 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr @@ -96,6 +96,7 @@ layers { input_layer_name: "rnn_forward" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__recurrent_group_1__" @@ -145,6 +146,7 @@ layers { } select_first: true trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__recurrent_group_2__" @@ -193,6 +195,7 @@ layers { input_layer_name: "rnn_subseq_forward" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__lstm_group_0___recurrent_group" @@ -282,6 +285,7 @@ layers { input_layer_name: "__lstm_group_0__" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__gru_group_0___recurrent_group" @@ -330,6 +334,7 @@ layers { input_layer_name: "__gru_group_0__" } trans_type: "non-seq" + seq_pool_stride: -1 } layers { name: "__recurrent_group_3__" @@ -378,6 +383,7 @@ layers { input_layer_name: "__fc_layer_0__" } trans_type: "non-seq" + seq_pool_stride: -1 } parameters { name: "___mixed_0__.w0" diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 25526bf409..7c8f6ea62f 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -21,19 +21,22 @@ import data_type import topology import data_feeder import networks +import evaluator from . import dataset from . import reader +from . import plot import attr import pooling import inference import networks import py_paddle.swig_paddle as api import minibatch +import plot __all__ = [ 'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer', 'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader', - 'topology', 'networks', 'infer' + 'topology', 'networks', 'infer', 'plot', 'evaluator' ] diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py index 1ec1d7bbdf..b0e8da563e 100644 --- a/python/paddle/v2/config_base.py +++ b/python/paddle/v2/config_base.py @@ -65,13 +65,42 @@ class Layer(object): def __init__(self, name=None, parent_layers=None): assert isinstance(parent_layers, dict) self.name = name - self.__contex__ = {} + self.__context__ = {} self.__parent_layers__ = parent_layers + # some layer may have some extra parent layer + self.__extra_parent__ = [] + # used for evaluator. + self.__children_layers__ = [] + + def extra_parent(self): + return self.__extra_parent__ + + def append_extra_parent(self, parent): + self.__extra_parent__.append(parent) + + def append_child(self, layer, parent_names): + self.__children_layers__.append((layer, parent_names)) def to_proto(self, context): """ function to set proto attribute """ + self.__context__ = context + + # STEP: short cut if this layer is parsed before. + if self.context_name() in context: + if self.use_context_name(): + return context[self.context_name()] + else: + return context[self.name] + + # STEP: parse extra_parent that is not used by this layer but must + # be parsed before this layer. 
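+        # (The extra parents are parsed purely for this side effect; their
+        # outputs are not passed into this layer's to_proto_impl.)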
+ for p in self.__extra_parent__: + p.to_proto(context=context) + + # STEP: parse parent that is used by this layer, get the result and + # insert into kwargs of the next layer's to_proto_impl method. kwargs = dict() for layer_name in self.__parent_layers__: if not isinstance(self.__parent_layers__[layer_name], @@ -83,12 +112,29 @@ class Layer(object): self.__parent_layers__[layer_name]) kwargs[layer_name] = v1_layer + # STEP: parse myself and add myself into context. + ret_val = self.to_proto_impl(**kwargs) + if self.context_name() is not None \ + and self.context_name() not in context: + context[self.context_name()] = ret_val + + # STEP: parse children that should be pased after this layer. + for layer, pnames in self.__children_layers__: + drop = False + + # child will only be parsed if all parents are in context. + for pname in pnames: + if pname not in context: + drop = True + break + if drop: + continue + layer.to_proto(context=context) + + # STEP: return v1 layer result if self.context_name() is None: - return self.to_proto_impl(**kwargs) - elif self.context_name() not in context: - context[self.context_name()] = self.to_proto_impl(**kwargs) - self.__contex__ = context - if self.use_context_name(): + return ret_val + elif self.use_context_name(): return context[self.context_name()] else: return context[self.name] @@ -113,10 +159,13 @@ class Layer(object): this layer is called. :return: """ - return self.__contex__[self.context_name()].size + return self.__context__[self.context_name()].size -def __convert_to_v2__(method_name, parent_names, is_default_name=True): +def __convert_to_v2__(method_name, + parent_names, + is_default_name=True, + attach_parent=False): if is_default_name: wrapper = wrap_name_default(name_prefix=method_name) else: @@ -129,9 +178,20 @@ def __convert_to_v2__(method_name, parent_names, is_default_name=True): parent_layers = dict() other_kwargs = dict() for pname in parent_names: - if kwargs.has_key(pname): + if pname in kwargs: parent_layers[pname] = kwargs[pname] + if attach_parent: + pnames = [x.context_name() for x in parent_layers.values()] + + for pname in parent_layers: + layers = kwargs[pname] + if not isinstance(layers, collections.Sequence): + layers = [layers] + + for layer in layers: + layer.append_child(self, pnames) + for key in kwargs.keys(): if key not in parent_names: other_kwargs[key] = kwargs[key] diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index bda8e22fd2..2698251b9e 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -13,7 +13,7 @@ # limitations under the License. from py_paddle import DataProviderConverter - +import collections import paddle.trainer.PyDataProvider2 as pydp2 __all__ = ['DataFeeder'] @@ -35,15 +35,30 @@ class DataFeeder(DataProviderConverter): DataFeeder converts this mini-batch data entries into Arguments in order to feed it to C++ interface. - The example usage: + The simple usage shows below + + .. code-block:: python + + feeding = ['image', 'label'] + data_types = enumerate_data_types_of_data_layers(topology) + feeder = DataFeeder(data_types=data_types, feeding=feeding) + + minibatch_data = [([1.0, 2.0, 3.0, ...], 5)] + + arg = feeder(minibatch_data) + + + If mini-batch data and data layers are not one to one mapping, we + could pass a dictionary to feeding parameter to represent the mapping + relationship. .. 
code-block:: python

        data_types = [('image', paddle.data_type.dense_vector(784)),
                      ('label', paddle.data_type.integer_value(10))]
-        reader_dict = {'image':0, 'label':1}
-        feeder = DataFeeder(data_types=data_types, reader_dict=reader_dict)
+        feeding = {'image':0, 'label':1}
+        feeder = DataFeeder(data_types=data_types, feeding=feeding)
         minibatch_data = [
             ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ),  # first sample
             ( [1.0,2.0,3.0,4.0], 5, [6,7,8] )   # second sample
@@ -52,7 +67,7 @@ class DataFeeder(DataProviderConverter):
         #                  [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
         #                  [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
         #                ]
-        arg = feeder(minibatch_data)
+        arg = feeder.convert(minibatch_data)
 
     .. note::
 
@@ -65,9 +80,9 @@ class DataFeeder(DataProviderConverter):
                        a tuple of (data_name, data_type).
     :type data_types: list
-    :param reader_dict: A dictionary to specify the position of each data
-                        in the input data.
-    :type feeding: dict
+    :param feeding: A dictionary or a sequence to specify the position of each
+                    data in the input data.
+    :type feeding: dict|collections.Sequence|None
     """
 
     def __init__(self, data_types, feeding=None):
@@ -75,6 +90,13 @@ class DataFeeder(DataProviderConverter):
         input_types = []
         if feeding is None:
             feeding = default_feeding_map(data_types)
+        elif isinstance(feeding, collections.Sequence):
+            feed_list = feeding
+            feeding = dict()
+            for i, name in enumerate(feed_list):
+                feeding[name] = i
+        elif not isinstance(feeding, dict):
+            raise TypeError("Feeding should be dict or sequence or None.")
         self.feeding = feeding
         for each in data_types:
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index d9f7a830ee..41fda1e8f2 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -12,15 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html
+CIFAR dataset.
+
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.
+
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
 
-TODO(yuyang18): Complete the comments.
 """
 
 import cPickle
 import itertools
 import numpy
-import paddle.v2.dataset.common
+from common import download
 import tarfile
 
 __all__ = ['train100', 'test100', 'train10', 'test10']
@@ -54,24 +65,60 @@ def reader_creator(filename, sub_name):
 
 
 def train100():
+    """
+    CIFAR-100 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 99].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'train')
+        download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train')
 
 
 def test100():
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'test')
+    """
+    CIFAR-100 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 99].
+
+    :return: Test reader creator.
+ :rtype: callable + """ + return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test') def train10(): + """ + CIFAR-10 training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Training reader creator + :rtype: callable + """ return reader_creator( - paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'data_batch') + download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch') def test10(): + """ + CIFAR-10 test set cretor. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ return reader_creator( - paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'test_batch') + download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch') + + +def fetch(): + download(CIFAR10_URL, 'cifar', CIFAR10_MD5) + download(CIFAR100_URL, 'cifar', CIFAR100_MD5) diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 3021b68ddb..2eb018b8d6 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -17,6 +17,8 @@ import hashlib import os import shutil import sys +import importlib +import paddle.v2.dataset __all__ = ['DATA_HOME', 'download', 'md5file'] @@ -64,8 +66,11 @@ def download(url, module_name, md5sum): return filename -def dict_add(a_dict, ele): - if ele in a_dict: - a_dict[ele] += 1 - else: - a_dict[ele] = 1 +def fetch_all(): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "fetch" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)): + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "fetch")() diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index 9eab49ee39..12d648bf65 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -11,19 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +Conll05 dataset. +Paddle semantic role labeling Book and demo use this dataset as an example. +Because Conll05 is not free in public, the default downloaded URL is test set +of Conll05 (which is public). Users can change URL and MD5 to their Conll +dataset. And a pre-trained word vector model based on Wikipedia corpus is used +to initialize SRL model. +""" import tarfile import gzip import itertools from common import download -""" -Conll 2005 dataset. Paddle semantic role labeling Book and demo use this -dataset as an example. Because Conll 2005 is not free in public, the default -downloaded URL is test set of Conll 2005 (which is public). Users can change -URL and MD5 to their Conll dataset. - -TODO(yuyang18): Complete comments. -""" __all__ = ['test, get_dict', 'get_embedding'] @@ -179,6 +179,9 @@ def reader_creator(corpus_reader, def get_dict(): + """ + Get the word, verb and label dictionary of Wikipedia corpus. + """ word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)) verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) @@ -186,13 +189,35 @@ def get_dict(): def get_embedding(): + """ + Get the trained word vector based on Wikipedia corpus. 
+ """ return download(EMB_URL, 'conll05st', EMB_MD5) def test(): + """ + Conll05 test set creator. + + Because the training dataset is not free, the test dataset is used for + training. It returns a reader creator, each sample in the reader is nine + features, including sentence sequence, predicate, predicate context, + predicate context flag and tagged sequence. + + :return: Training reader creator + :rtype: callable + """ word_dict, verb_dict, label_dict = get_dict() reader = corpus_reader( download(DATA_URL, 'conll05st', DATA_MD5), words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') return reader_creator(reader, word_dict, verb_dict, label_dict) + + +def fetch(): + download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) + download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) + download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) + download(EMB_URL, 'conll05st', EMB_MD5) + download(DATA_URL, 'conll05st', DATA_MD5) diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index 76019d9f54..5dc5abfe53 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -12,12 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz +IMDB dataset. -TODO(yuyang18): Complete comments. +This module downloads IMDB dataset from +http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set +of 25,000 highly polar movie reviews for training, and 25,000 for testing. +Besides, this module also provides API for building dictionary. """ import paddle.v2.dataset.common +import collections import tarfile import Queue import re @@ -30,8 +34,11 @@ URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' MD5 = '7c2ac02c03563afcf9b574c7e56c153a' -# Read files that match pattern. Tokenize and yield each file. def tokenize(pattern): + """ + Read files that match the given pattern. Tokenize and yield each file. + """ + with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb', MD5)) as tarf: # Note that we should use tarfile.next(), which does @@ -48,10 +55,14 @@ def tokenize(pattern): def build_dict(pattern, cutoff): - word_freq = {} + """ + Build a word dictionary from the corpus. Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ + word_freq = collections.defaultdict(int) for doc in tokenize(pattern): for word in doc: - paddle.v2.dataset.common.dict_add(word_freq, word) + word_freq[word] += 1 # Not sure if we should prune less-frequent words here. word_freq = filter(lambda x: x[1] > cutoff, word_freq.items()) @@ -109,17 +120,49 @@ def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size): def train(word_idx): + """ + IMDB training set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Training reader creator + :rtype: callable + """ return reader_creator( re.compile("aclImdb/train/pos/.*\.txt$"), re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000) def test(word_idx): + """ + IMDB test set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. 
+ + :param word_idx: word dictionary + :type word_idx: dict + :return: Test reader creator + :rtype: callable + """ return reader_creator( re.compile("aclImdb/test/pos/.*\.txt$"), re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000) def word_dict(): + """ + Build a word dictionary from the corpus. + + :return: Word dictionary + :rtype: dict + """ return build_dict( re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) + + +def fetch(): + paddle.v2.dataset.common.download(URL, 'imdb', MD5) diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index 97c160f111..41ca27e236 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/ +imikolov's simple dataset. -Complete comments. +This module will download dataset from +http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set +into paddle reader creators. """ import paddle.v2.dataset.common +import collections import tarfile __all__ = ['train', 'test', 'build_dict'] @@ -26,20 +29,23 @@ MD5 = '30177ea32e27c525793142b6bf2c8e2d' def word_count(f, word_freq=None): - add = paddle.v2.dataset.common.dict_add - if word_freq == None: - word_freq = {} + if word_freq is None: + word_freq = collections.defaultdict(int) for l in f: for w in l.strip().split(): - add(word_freq, w) - add(word_freq, '') - add(word_freq, '') + word_freq[w] += 1 + word_freq[''] += 1 + word_freq[''] += 1 return word_freq def build_dict(): + """ + Build a word dictionary from the corpus, Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ train_filename = './simple-examples/data/ptb.train.txt' test_filename = './simple-examples/data/ptb.valid.txt' with tarfile.open( @@ -84,8 +90,38 @@ def reader_creator(filename, word_idx, n): def train(word_idx, n): + """ + imikolov training set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. + + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size + :type n: int + :return: Training reader creator + :rtype: callable + """ return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n) def test(word_idx, n): + """ + imikolov test set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. + + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size + :type n: int + :return: Test reader creator + :rtype: callable + """ return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n) + + +def fetch(): + paddle.v2.dataset.common.download(URL, "imikolov", MD5) diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 16f2fcb99d..c1347d3c66 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -15,7 +15,7 @@ MNIST dataset. This module will download dataset from http://yann.lecun.com/exdb/mnist/ and -parse train set and test set into paddle reader creators. +parse training set and test set into paddle reader creators. """ import paddle.v2.dataset.common import subprocess @@ -76,12 +76,12 @@ def reader_creator(image_filename, label_filename, buffer_size): def train(): """ - MNIST train set creator. + MNIST training set creator. 
It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
 
-    :return: Train reader creator
+    :return: Training reader creator
     :rtype: callable
     """
     return reader_creator(
@@ -106,3 +106,10 @@ def test():
                                                 TEST_IMAGE_MD5),
         paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
                                           TEST_LABEL_MD5), 100)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
index 25fd8227da..837a859126 100644
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@@ -14,7 +14,12 @@
 """
 Movielens 1-M dataset.
 
-TODO(yuyang18): Complete comments.
+Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
+movies, which was collected by GroupLens Research. This module will download
+Movielens 1-M dataset from
+http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
+set and test set into paddle reader creators.
+
 """
 
 import zipfile
@@ -30,14 +35,24 @@ __all__ = [
 
 age_table = [1, 18, 25, 35, 45, 50, 56]
 
+URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
+MD5 = 'c4d9eecfca2ab87c1945afe126590906'
+
 
 class MovieInfo(object):
+    """
+    Movie id, title and categories information are stored in MovieInfo.
+    """
+
     def __init__(self, index, categories, title):
         self.index = int(index)
         self.categories = categories
         self.title = title
 
     def value(self):
+        """
+        Get information from a movie.
+        """
         return [
             self.index, [CATEGORIES_DICT[c] for c in self.categories],
             [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
@@ -52,6 +67,10 @@ class MovieInfo(object):
 
 
 class UserInfo(object):
+    """
+    User id, gender, age, and job information are stored in UserInfo.
+    """
+
     def __init__(self, index, gender, age, job_id):
         self.index = int(index)
         self.is_male = gender == 'M'
@@ -59,6 +78,9 @@ class UserInfo(object):
         self.job_id = int(job_id)
 
     def value(self):
+        """
+        Get information from a user.
+        """
         return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
 
     def __str__(self):
@@ -77,10 +99,7 @@ USER_INFO = None
 
 
 def __initialize_meta_info__():
-    fn = download(
-        url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
-        module_name='movielens',
-        md5sum='c4d9eecfca2ab87c1945afe126590906')
+    fn = download(URL, "movielens", MD5)
     global MOVIE_INFO
     if MOVIE_INFO is None:
         pattern = re.compile(r'^(.*)\((\d+)\)$')
@@ -148,6 +167,9 @@ test = functools.partial(__reader_creator__, is_test=True)
 
 
 def get_movie_title_dict():
+    """
+    Get movie title dictionary.
+    """
     __initialize_meta_info__()
     return MOVIE_TITLE_DICT
 
@@ -160,11 +182,17 @@ def __max_index_info__(a, b):
 
 
 def max_movie_id():
+    """
+    Get the maximum value of movie id.
+    """
     __initialize_meta_info__()
     return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
 
 
 def max_user_id():
+    """
+    Get the maximum value of user id.
+    """
     __initialize_meta_info__()
     return reduce(__max_index_info__, USER_INFO.viewvalues()).index
 
@@ -177,21 +205,33 @@ def __max_job_id_impl__(a, b):
 
 
 def max_job_id():
+    """
+    Get the maximum value of job id.
+    """
     __initialize_meta_info__()
     return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
 
 
 def movie_categories():
+    """
+    Get movie categories dictionary.
+ """ __initialize_meta_info__() return CATEGORIES_DICT def user_info(): + """ + Get user info dictionary. + """ __initialize_meta_info__() return USER_INFO def movie_info(): + """ + Get movie info dictionary. + """ __initialize_meta_info__() return MOVIE_INFO @@ -205,5 +245,9 @@ def unittest(): print train_count, test_count +def fetch(): + download(URL, "movielens", MD5) + + if __name__ == '__main__': unittest() diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index 71689fd61b..4dd34e7383 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -113,7 +113,7 @@ def reader_creator(data): def train(): """ - Default train set reader creator + Default training set reader creator """ data_set = load_sentiment_data() return reader_creator(data_set[0:NUM_TRAINING_INSTANCES]) @@ -125,3 +125,7 @@ def test(): """ data_set = load_sentiment_data() return reader_creator(data_set[NUM_TRAINING_INSTANCES:]) + + +def fetch(): + nltk.download('movie_reviews', download_dir=common.DATA_HOME) diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index 27f454b137..3469fd9ce1 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -14,7 +14,9 @@ """ UCI Housing dataset. -TODO(yuyang18): Complete comments. +This module will download dataset from +https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and +parse training set and test set into paddle reader creators. """ import numpy as np @@ -70,6 +72,15 @@ def load_data(filename, feature_num=14, ratio=0.8): def train(): + """ + UCI_HOUSING training set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Training reader creator + :rtype: callable + """ global UCI_TRAIN_DATA load_data(download(URL, 'uci_housing', MD5)) @@ -81,6 +92,15 @@ def train(): def test(): + """ + UCI_HOUSING test set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Test reader creator + :rtype: callable + """ global UCI_TEST_DATA load_data(download(URL, 'uci_housing', MD5)) @@ -89,3 +109,7 @@ def test(): yield d[:-1], d[-1:] return reader + + +def fetch(): + download(URL, 'uci_housing', MD5) diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index c686870a49..0902f87741 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -12,11 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -wmt14 dataset +WMT14 dataset. +The original WMT14 dataset is too large and a small set of data for set is +provided. This module will download dataset from +http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and +parse training set and test set into paddle reader creators. + """ import tarfile +import gzip -import paddle.v2.dataset.common +from paddle.v2.dataset.common import download +from paddle.v2.parameters import Parameters __all__ = ['train', 'test', 'build_dict'] @@ -24,7 +31,10 @@ URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/de MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' # this is a small set of data for test. The original data is too large and will be add later. 
URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz' -MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6' +MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' +# this is the pretrained model, whose bleu = 26.92 +URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +MD5_MODEL = '4ce14a26607fb8a1cc23bcdedb1895e4' START = "" END = "" @@ -94,12 +104,58 @@ def reader_creator(tar_file, file_name, dict_size): def train(dict_size): + """ + WMT14 training set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. + + :return: Training reader creator + :rtype: callable + """ return reader_creator( - paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), - 'train/train', dict_size) + download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size) def test(dict_size): + """ + WMT14 test set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. + + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size) + + +def gen(dict_size): return reader_creator( - paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), - 'test/test', dict_size) + download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'gen/gen', dict_size) + + +def model(): + tar_file = download(URL_MODEL, 'wmt14', MD5_MODEL) + with gzip.open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + +def get_dict(dict_size, reverse=True): + # if reverse = False, return dict = {'a':'001', 'b':'002', ...} + # else reverse = true, return dict = {'001':'a', '002':'b', ...} + tar_file = download(URL_TRAIN, 'wmt14', MD5_TRAIN) + src_dict, trg_dict = __read_to_dict__(tar_file, dict_size) + if reverse: + src_dict = {v: k for k, v in src_dict.items()} + trg_dict = {v: k for k, v in trg_dict.items()} + return src_dict, trg_dict + + +def fetch(): + download(URL_TRAIN, 'wmt14', MD5_TRAIN) + download(URL_MODEL, 'wmt14', MD5_MODEL) diff --git a/python/paddle/v2/evaluator.py b/python/paddle/v2/evaluator.py new file mode 100644 index 0000000000..588eefa391 --- /dev/null +++ b/python/paddle/v2/evaluator.py @@ -0,0 +1,47 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import paddle.trainer_config_helpers.evaluators as evs
+import inspect
+from config_base import __convert_to_v2__
+
+__all__ = []
+
+
+def initialize():
+    def convert_to_new_name(nm):
+        return nm[:-len("_evaluator")]
+
+    for __ev_name__ in filter(lambda x: x.endswith('_evaluator'), evs.__all__):
+        __ev__ = getattr(evs, __ev_name__)
+        if hasattr(__ev__, 'argspec'):
+            argspec = __ev__.argspec
+        else:
+            argspec = inspect.getargspec(__ev__)
+        parent_names = filter(lambda x: x in ['input', 'label', 'weight'],
+                              argspec.args)
+        v2_ev = __convert_to_v2__(
+            __ev_name__,
+            parent_names=parent_names,
+            is_default_name='name' in argspec.args,
+            attach_parent=True)
+
+        __new_name__ = convert_to_new_name(__ev_name__)
+
+        globals()[__new_name__] = v2_ev
+        globals()[__new_name__].__name__ = __new_name__
+        __all__.append(__new_name__)
+
+
+initialize()
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
index 1ad52b8baa..fd6050fa33 100644
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
@@ -1,14 +1,13 @@
 """
-All training events.
+Testing and training events.
 
 There are:
 
+* TestResult
 * BeginIteration
 * EndIteration
 * BeginPass
 * EndPass
-
-TODO(yuyang18): Complete it!
 """
 import py_paddle.swig_paddle as api
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index ec3c67d895..c178336303 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -9,6 +9,17 @@ __all__ = ['infer']
 
 
 class Inference(object):
+    """
+    Inference combines neural network output and parameters together
+    to do inference.
+
+    :param output_layer: The neural network used for inference.
+    :type output_layer: paddle.v2.config_base.Layer or the sequence
+                        of paddle.v2.config_base.Layer
+    :param parameters: The parameters dictionary.
+    :type parameters: paddle.v2.parameters.Parameters
+    """
+
     def __init__(self, output_layer, parameters):
         topo = topology.Topology(output_layer)
         gm = api.GradientMachine.createFromConfigProto(
@@ -37,14 +48,19 @@ class Inference(object):
         self.__gradient_machine__.finish()
 
     def iter_infer_field(self, field, **kwargs):
+        if not isinstance(field, list) and not isinstance(field, tuple):
+            field = [field]
+
         for result in self.iter_infer(**kwargs):
-            yield [each_result[field] for each_result in result]
+            for each_result in result:
+                item = [each_result[each_field] for each_field in field]
+                yield item
 
     def infer(self, field='value', **kwargs):
         retv = None
         for result in self.iter_infer_field(field=field, **kwargs):
             if retv is None:
-                retv = [[]] * len(result)
+                retv = [[] for i in xrange(len(result))]
             for i, item in enumerate(result):
                 retv[i].append(item)
         retv = [numpy.concatenate(out) for out in retv]
@@ -76,9 +92,11 @@ def infer(output_layer, parameters, input, feeding=None, field='value'):
     :type input: collections.Iterable
     :param feeding: Reader dictionary. Default could generate from input
                     value.
-    :param field: The prediction field. It should in [`value`, `ids`]. `value`
-                  means return the prediction probabilities, `ids` means
-                  return the prediction labels. Default is `value`
+    :param field: The prediction field. It should be in [`value`, `id`,
+                  `prob`]. `value` and `prob` mean returning the prediction
+                  probabilities, `id` means returning the prediction labels.
+                  Default is `value`. Note that `prob` is only used when
+                  output_layer is beam_search or max_id.
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 1e4efedde3..384de9b9d5 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -33,40 +33,52 @@ The primary usage shows below.
 
 import collections
 import inspect
-from config_base import Layer, __convert_to_v2__
+import re
+
 import paddle.trainer_config_helpers as conf_helps
+from paddle.trainer.config_parser import \
+    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
+    RecurrentLayerGroupEnd, model_type
 from paddle.trainer_config_helpers.config_parser_utils import \
     parse_network_config as __parse__
 from paddle.trainer_config_helpers.default_decorators import wrap_act_default
 from paddle.trainer_config_helpers.default_decorators import \
     wrap_bias_attr_default
 from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+from paddle.trainer_config_helpers.layers import RecurrentLayerGroupSetGenerator, Generator
 from paddle.trainer_config_helpers.layers import layer_support
-from paddle.trainer.config_parser import \
-    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
-    RecurrentLayerGroupEnd, model_type
 
 import activation
-import re
+import attr
 import data_type
+from config_base import Layer, __convert_to_v2__
 
 __all__ = ['parse_network', 'data']
 
 
-def parse_network(*outputs):
+def parse_network(output_layers, extra_layers=None):
     """
-    Parse all output layers and then generate a ModelConfig object.
+    Parse all layers in the neural network graph and
+    then generate a ModelConfig object.
 
     ..  note::
 
         This function is used internally in paddle.v2 module. User should never
         invoke this method.
 
-    :param outputs: Output layers.
-    :type outputs: Layer
+    :param output_layers: Output layers.
+    :type output_layers: Layer
+    :param extra_layers: Layers in the neural network graph that are not on
+                         the path of output_layers.
+    :type extra_layers: Layer
    :return: A ModelConfig object instance.
    :rtype: ModelConfig
    """
+    if not isinstance(output_layers, collections.Sequence):
+        output_layers = [output_layers]
+    if extra_layers is not None and not isinstance(extra_layers,
+                                                   collections.Sequence):
+        extra_layers = [extra_layers]
 
     def __real_func__():
         """
@@ -74,7 +86,11 @@ def parse_network(*outputs):
         the plain old paddle configuration function.
         """
         context = dict()
-        real_output = [each.to_proto(context=context) for each in outputs]
+        real_output = [each.to_proto(context=context) for each in output_layers]
+        if extra_layers is not None:
+            # Calling to_proto registers each extra layer in the shared
+            # context; the resulting list itself is not used.
+            extra_output = [
+                each.to_proto(context=context) for each in extra_layers
+            ]
         conf_helps.outputs(real_output)
 
     return __parse__(__real_func__)
@@ -119,54 +135,23 @@ class DataLayerV2(Layer):
         return doc
 
 
-class WithExtraParent(Layer):
-    def extra_parent(self):
-        return self.__extra_parent__
-
-    def __init__(self, name=None, parent_layers=None):
-        self.__extra_parent__ = []
-        super(WithExtraParent, self).__init__(
-            name=name, parent_layers=parent_layers)
-
-    def append_extra_parent(self, parent):
-        self.__extra_parent__.append(parent)
-
-    def to_proto(self, context):
+class MemoryV2(Layer):
+    def __init__(self, name, extra_input=None, **kwargs):
         """
-        function to set proto attribute
+        Initialize a memory object. If a memory is initialized inside a
+        recurrent_group step function, it may depend on a boot_layer that
+        should be initialized outside recurrent_group, so we:
+        1. add RecurrentLayerInput to extra_parent of self.
+        2. add boot_layer to the extra_parent of RecurrentLayerInput.
+
+        :param extra_input: list of RecurrentLayerInput
+        :type extra_input: [RecurrentLayerInput]
         """
-        kwargs = dict()
-        for p in self.__extra_parent__:
-            p.to_proto(context=context)
-
-        for layer_name in self.__parent_layers__:
-            if not isinstance(self.__parent_layers__[layer_name],
-                              collections.Sequence):
-                v1_layer = self.__parent_layers__[layer_name].to_proto(
-                    context=context)
-            else:
-                v1_layer = map(lambda x: x.to_proto(context=context),
-                               self.__parent_layers__[layer_name])
-            kwargs[layer_name] = v1_layer
-
-        if self.context_name() is None:
-            return self.to_proto_impl(context=context, **kwargs)
-        elif self.context_name() not in context:
-            context[self.context_name()] = self.to_proto_impl(
-                context=context, **kwargs)
-
-        if self.use_context_name():
-            return context[self.context_name()]
-        else:
-            return context[self.name]
-
-
-class MemoryV2(WithExtraParent):
-    def __init__(self, name, **kwargs):
         self.name = name
         super(MemoryV2, self).__init__(name=name, parent_layers=dict())
         self.__kwargs__ = kwargs
         self.__boot_layer_name__ = None
+
         if 'boot_layer' in kwargs:
             begin_of_current_rnn = []
             # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a
@@ -189,11 +174,10 @@ class MemoryV2(WithExtraParent):
             assert begin_of_current_rnn is not None
             for extra in begin_of_current_rnn:
                 self.append_extra_parent(extra)
-                assert isinstance(extra, WithExtraParent)
                 extra.append_extra_parent(kwargs['boot_layer'])
                 self.__boot_layer_name__ = kwargs['boot_layer'].name
 
-    def to_proto_impl(self, context, **kwargs):
+    def to_proto_impl(self, **kwargs):
         args = dict()
         for each in kwargs:
             args[each] = kwargs[each]
@@ -201,7 +185,7 @@ class MemoryV2(WithExtraParent):
             args[each] = self.__kwargs__[each]
 
         if self.__boot_layer_name__ is not None:
-            args['boot_layer'] = context[self.__boot_layer_name__]
+            args['boot_layer'] = self.__context__[self.__boot_layer_name__]
 
         size = args.get('size', None)
         if size is not None:
@@ -223,22 +207,6 @@ class MemoryV2(WithExtraParent):
         return True
 
 
-class LayerOutputV2(Layer):
-    """
-    LayerOutputV2 is used to store the result of LayerOutput in v1 api.
-    It will not store it's parents because layer_output has been parsed already.
- """ - - def __init__(self, layer_output): - assert isinstance(layer_output, conf_helps.LayerOutput) - self.layer_output = layer_output - super(LayerOutputV2, self).__init__( - name=layer_output.name, parent_layers=dict()) - - def to_proto_impl(self): - return self.layer_output - - class StaticInputV2(object): def __init__(self, input, is_seq=False, size=None): assert isinstance(input, LayerV2) @@ -250,6 +218,66 @@ class StaticInputV2(object): # assert input.size is not None or size is not None +class BaseGeneratedInputV2(object): + def __init__(self): + self.bos_id = None + self.eos_id = None + + def before_real_step(self): + raise NotImplementedError() + + def after_real_step(self, *args): + raise NotImplementedError() + + +class GeneratedInputV2(BaseGeneratedInputV2): + def __init__(self, size, embedding_name, embedding_size): + super(GeneratedInputV2, self).__init__() + self.size = size + self.embedding_name = embedding_name + self.embedding_size = embedding_size + + def after_real_step(self, input): + return max_id(input=input, name='__beam_search_predict__') + + def before_real_step(self): + predict_id = memory( + name='__beam_search_predict__', + size=self.size, + boot_with_const_id=self.bos_id) + + trg_emb = embedding( + input=predict_id, + size=self.embedding_size, + param_attr=attr.ParamAttr(name=self.embedding_name)) + return trg_emb + + +class RecurrentLayerGroupSetGeneratorV2(Layer): + def __init__(self, eos_name, max_length, beam_size, num_results_per_sample): + self.eos_name = eos_name + self.max_length = max_length + self.beam_size = beam_size + self.num_results_per_sample = num_results_per_sample + super(RecurrentLayerGroupSetGeneratorV2, self).__init__( + name=eos_name, parent_layers={}) + + def to_proto_impl(self, **kwargs): + RecurrentLayerGroupSetGenerator( + Generator( + eos_layer_name=self.eos_name, + max_num_frames=self.max_length, + beam_size=self.beam_size, + num_results_per_sample=self.num_results_per_sample)) + return self + + def context_name(self): + return self.eos_name + ".fake" + + def use_context_name(self): + return True + + class MixedLayerV2(Layer): """ This class is use to support `with` grammar. 
     If not, the following code
@@ -328,18 +356,24 @@ def mixed(size=0,
     return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
 
 
-class RecurrentLayerInput(WithExtraParent):
+class RecurrentLayerInput(Layer):
     def __init__(self, recurrent_name, index, parent_layers):
-        assert len(parent_layers) == 1
-        self.__parents__ = parent_layers.values()[0]
-        super(RecurrentLayerInput, self).__init__(
-            name=self.__parents__[index].name, parent_layers=parent_layers)
+        parents_len = len(parent_layers)
+        assert parents_len <= 1
+        if parents_len == 0:
+            self.__parents__ = []
+        else:
+            self.__parents__ = parent_layers.values()[0]
         self.__recurrent_name__ = recurrent_name
+        name = self.__parents__[
+            index].name if index >= 0 else self.context_name()
+        super(RecurrentLayerInput, self).__init__(
+            name=name, parent_layers=parent_layers)
 
     def context_name(self):
         return self.__recurrent_name__ + ".begin"
 
-    def to_proto_impl(self, context, **kwargs):
+    def to_proto_impl(self, **kwargs):
         model_type('recurrent_nn')
         RecurrentLayerGroupWithoutOutLinksBegin(
             name=self.__recurrent_name__,
@@ -436,6 +470,11 @@ def recurrent_group(step, input, name=None):
         for i in xrange(len(non_static_inputs))
     ]
 
+    extra_input = None
+    if len(non_static_inputs) == 0:
+        extra_input = RecurrentLayerInput(
+            recurrent_name=name, index=-1, parent_layers={})
+
     def __real_step__(*args):
         rnn_input = list(args)
         static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
@@ -443,6 +482,7 @@ def recurrent_group(step, input, name=None):
             mem_name = "__%s_memory__" % static_input.input.name
             mem = memory(
                 name=mem_name,
+                extra_input=extra_input,
                 is_seq=static_input.is_seq,
                 size=static_input.input.calculate_size,
                 boot_layer=static_input.input)
@@ -472,6 +512,73 @@ def recurrent_group(step, input, name=None):
     return retv
 
 
+@wrap_name_default()
+def beam_search(step,
+                input,
+                bos_id,
+                eos_id,
+                beam_size,
+                max_length=500,
+                name=None,
+                num_results_per_sample=None):
+    if num_results_per_sample is None:
+        num_results_per_sample = beam_size
+    assert num_results_per_sample <= beam_size
+
+    if isinstance(input, StaticInputV2) or isinstance(input,
+                                                      BaseGeneratedInputV2):
+        input = [input]
+
+    generated_input_index = -1
+
+    real_input = []
+    for i, each_input in enumerate(input):
+        assert isinstance(each_input, StaticInputV2) or isinstance(
+            each_input, BaseGeneratedInputV2)
+        if isinstance(each_input, BaseGeneratedInputV2):
+            assert generated_input_index == -1
+            generated_input_index = i
+        else:
+            real_input.append(each_input)
+
+    assert generated_input_index != -1
+
+    gipt = input[generated_input_index]
+    assert isinstance(gipt, BaseGeneratedInputV2)
+
+    gipt.bos_id = bos_id
+    gipt.eos_id = eos_id
+
+    def __real_step__(*args):
+        eos_name = "__%s_eos_layer__" % name
+        generator = RecurrentLayerGroupSetGeneratorV2(
+            eos_name, max_length, beam_size, num_results_per_sample)
+
+        args = list(args)
+        before_step_layer = gipt.before_real_step()
+        before_step_layer.append_child(
+            layer=generator, parent_names=[before_step_layer.name])
+        args.insert(generated_input_index, before_step_layer)
+
+        predict = gipt.after_real_step(step(*args))
+
+        eos_layer = eos(input=predict, eos_id=eos_id, name=eos_name)
+        predict.append_child(layer=eos_layer, parent_names=[predict.name])
+
+        return predict
+
+    tmp = recurrent_group(step=__real_step__, input=real_input, name=name)
+
+    return tmp
+
+
 __projection_names__ = filter(lambda x: x.endswith('_projection'),
                               dir(conf_helps))
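To make the new beam_search API concrete, here is a hedged sketch of a generation-time decoder. The trivial "encoder", sizes and token ids are illustrative placeholders; GeneratedInputV2 and StaticInputV2 are the classes introduced above in paddle/v2/layer.py, and a real decoder step would use GRU and attention layers.

```python
# Illustrative-only sketch of v2 beam-search decoding; all sizes/ids are
# placeholders and the encoder is deliberately trivial.
import paddle.v2 as paddle
import paddle.v2.layer as layer

dict_size = 30000  # target dictionary size (illustrative)

src = layer.data(name='source',
                 type=paddle.data_type.integer_value_sequence(dict_size))
src_emb = layer.embedding(input=src, size=512)
encoded = layer.fc(input=src_emb, size=512)  # stand-in for a real encoder


def decoder_step(enc_vec, trg_emb):
    # One decoding step: returns the per-step softmax over the target
    # vocabulary (a real decoder would use GRU/attention here).
    return layer.fc(input=[enc_vec, trg_emb], size=dict_size,
                    act=paddle.activation.Softmax())


trg_embedding = layer.GeneratedInputV2(
    size=dict_size,
    embedding_name='_target_language_embedding',
    embedding_size=512)

beam_gen = layer.beam_search(
    step=decoder_step,
    input=[layer.StaticInputV2(input=encoded), trg_embedding],
    bos_id=0,  # id of the begin-of-sequence token
    eos_id=1,  # id of the end-of-sequence token
    beam_size=3,
    max_length=250)
```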
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 1a01d95c20..feefd7d758 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -47,6 +47,35 @@ class Optimizer(object):
 
 
 class Momentum(Optimizer):
+    """
+    SGD Optimizer.
+
+    SGD is an optimization method that iteratively searches for the neural
+    network parameters that minimize its cost/error. In Paddle's
+    implementation the SGD optimizer is synchronized: all gradients are
+    computed and reduced into one gradient before the optimization step is
+    applied.
+
+    The neural network considers the learning problem as minimizing an
+    objective function that has the form of a sum
+
+    ..  math::
+
+        Q(w) = \\sum_{i}^{n} Q_i(w)
+
+    The value of Q is typically the cost of the neural network (for example,
+    the mean squared error between prediction and label). Q is parameterized
+    by w, the weights/biases of the network, which are what is learned; i
+    indexes the i-th observation in the training data.
+
+    The SGD method then optimizes the weights by
+
+    ..  math::
+
+        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
+
+    where :math:`\\eta` is the learning rate and :math:`n` is the batch size.
+    """
+
     def __init__(self, momentum=None, sparse=False, **kwargs):
         learning_method = v1_optimizers.MomentumOptimizer(
             momentum=momentum, sparse=sparse)
@@ -55,6 +84,26 @@ class Momentum(Optimizer):
 
 
 class Adam(Optimizer):
+    """
+    Adam optimizer.
+    For details, please refer to `Adam: A Method for Stochastic Optimization
+    <https://arxiv.org/abs/1412.6980>`_.
+
+    ..  math::
+
+        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
+        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w, t) + \\epsilon}}
+
+    :param beta1: the :math:`\\beta_1` in the equation.
+    :type beta1: float
+    :param beta2: the :math:`\\beta_2` in the equation.
+    :type beta2: float
+    :param epsilon: the :math:`\\epsilon` in the equation. It is used to
+                    prevent division by zero.
+    :type epsilon: float
+    """
+
     def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
         learning_method = v1_optimizers.AdamOptimizer(
             beta1=beta1, beta2=beta2, epsilon=epsilon)
@@ -62,6 +111,24 @@ class Adam(Optimizer):
 
 
 class Adamax(Optimizer):
+    """
+    Adamax optimizer.
+
+    For details, please refer to `Adam: A Method for Stochastic Optimization
+    <https://arxiv.org/abs/1412.6980>`_ (AdaMax is a variant introduced in
+    the same paper).
+
+    ..  math::
+
+        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
+        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
+        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
+
+    :param beta1: the :math:`\\beta_1` in the equation.
+    :type beta1: float
+    :param beta2: the :math:`\\beta_2` in the equation.
+    :type beta2: float
+    """
+
     def __init__(self, beta1=0.9, beta2=0.999, **kwargs):
         learning_method = v1_optimizers.AdamaxOptimizer(
             beta1=beta1, beta2=beta2)
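To make the update rules in these docstrings concrete, here is a small NumPy-only sketch of the Adam recurrence as written above (without bias correction), applied to a toy quadratic cost. It is an illustration, not Paddle code; the hyperparameter values mirror the defaults of the class.

```python
# NumPy sketch of the Adam update from the docstring above (no bias
# correction), minimizing the toy cost ||w - 1||^2.
import numpy as np

beta1, beta2, epsilon, eta = 0.9, 0.999, 1e-8, 0.1
w = np.zeros(4)
m = np.zeros_like(w)
v = np.zeros_like(w)

for t in range(1000):
    grad = 2.0 * (w - 1.0)                   # nabla Q_i(w)
    m = beta1 * m + (1.0 - beta1) * grad     # first-moment estimate
    v = beta2 * v + (1.0 - beta2) * grad**2  # second-moment estimate
    w = w - eta * m / np.sqrt(v + epsilon)   # parameter update

print w  # approaches [1. 1. 1. 1.]
```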
@@ -69,12 +136,40 @@
 
 
 class AdaGrad(Optimizer):
+    """
+    AdaGrad (ADAptive GRAdient algorithm) optimizer.
+
+    For details, please refer to `Adaptive Subgradient Methods for
+    Online Learning and Stochastic Optimization
+    <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.
+
+    ..  math::
+
+        G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
+        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
+    """
+
     def __init__(self, **kwargs):
         learning_method = v1_optimizers.AdaGradOptimizer()
         super(AdaGrad, self).__init__(learning_method=learning_method, **kwargs)
 
 
 class DecayedAdaGrad(Optimizer):
+    """
+    AdaGrad method with decayed sum gradients. The equations of this method
+    are as follows.
+
+    ..  math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= 1 / \\sqrt{E(g_t^2) + \\epsilon}
+
+    :param rho: The :math:`\\rho` parameter in that equation.
+    :type rho: float
+    :param epsilon: The :math:`\\epsilon` parameter in that equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
         learning_method = v1_optimizers.DecayedAdaGradOptimizer(
             rho=rho, epsilon=epsilon)
@@ -83,6 +178,24 @@
 
 
 class AdaDelta(Optimizer):
+    """
+    AdaDelta method. For details, please refer to
+    `ADADELTA: An Adaptive Learning Rate Method
+    <https://arxiv.org/abs/1212.5701>`_.
+
+    ..  math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= \\sqrt{(E(dx_{t-1}^2) + \\epsilon) /
+                          (E(g_t^2) + \\epsilon)} \\\\
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
+
+    :param rho: :math:`\\rho` in the equation.
+    :type rho: float
+    :param epsilon: :math:`\\epsilon` in the equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
         learning_method = v1_optimizers.AdaDeltaOptimizer(
             rho=rho, epsilon=epsilon)
@@ -91,6 +204,24 @@
 
 
 class RMSProp(Optimizer):
+    """
+    RMSProp (Root Mean Square Propagation) optimizer. For details, please
+    refer to this `slide
+    <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.
+
+    The equations of this method are as follows:
+
+    ..  math::
+
+        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+        w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    :param rho: the :math:`\\rho` in the equation; the forgetting factor.
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` in the equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
         learning_method = v1_optimizers.RMSPropOptimizer(
             rho=rho, epsilon=epsilon)
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index 05dc5c68dd..d686d09f22 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -159,7 +159,8 @@ class Parameters(object):
         if not self.has_key(key):
             raise ValueError("No such parameter %s" % key)
         conf = self.__param_conf__[key]
-        return tuple(map(int, conf.dims))
+        dims = conf.dims if conf.dims else (1, conf.size)
+        return tuple(map(int, dims))
 
     def __setitem__(self, key, value):
         """
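A hedged illustration of the get_shape() fallback just added: a parameter whose proto carries no explicit dims (typically a bias vector) now reports (1, size) instead of failing on an empty dims list. The layer and parameter names below are illustrative, as is the use of paddle.attr.Param to name the bias.

```python
# Illustrative sketch of Parameters.get_shape() with the new dims fallback.
import paddle.v2 as paddle

paddle.init(use_gpu=False)
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(4))
y = paddle.layer.fc(input=x,
                    size=2,
                    bias_attr=paddle.attr.Param(name='fc.bias'))
params = paddle.parameters.create(y)

# If the proto for 'fc.bias' has empty dims, this now yields (1, 2).
print params.get_shape('fc.bias')
```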
diff --git a/python/paddle/v2/plot/__init__.py b/python/paddle/v2/plot/__init__.py
new file mode 100644
index 0000000000..acd3013db4
--- /dev/null
+++ b/python/paddle/v2/plot/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from plot import Ploter
+
+__all__ = ['Ploter']
diff --git a/python/paddle/v2/plot/plot.py b/python/paddle/v2/plot/plot.py
new file mode 100644
index 0000000000..6f7bd039b0
--- /dev/null
+++ b/python/paddle/v2/plot/plot.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+class PlotData(object):
+    def __init__(self):
+        self.step = []
+        self.value = []
+
+    def append(self, step, value):
+        self.step.append(step)
+        self.value.append(value)
+
+    def reset(self):
+        self.step = []
+        self.value = []
+
+
+class Ploter(object):
+    def __init__(self, *args):
+        self.__args__ = args
+        self.__plot_data__ = {}
+        for title in args:
+            self.__plot_data__[title] = PlotData()
+        # Demos in notebooks use Ploter to draw figures, but when we convert
+        # the notebooks (.ipynb) to .py files for testing, importing
+        # matplotlib makes the script crash. Use `export DISABLE_PLOT=True`
+        # to skip importing these libs.
+        self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
+        if not self.__plot_is_disabled__():
+            import matplotlib.pyplot as plt
+            from IPython import display
+            self.plt = plt
+            self.display = display
+
+    def __plot_is_disabled__(self):
+        return self.__disable_plot__ == "True"
+
+    def append(self, title, step, value):
+        assert isinstance(title, basestring)
+        assert self.__plot_data__.has_key(title)
+        data = self.__plot_data__[title]
+        assert isinstance(data, PlotData)
+        data.append(step, value)
+
+    def plot(self):
+        if self.__plot_is_disabled__():
+            return
+
+        titles = []
+        for title in self.__args__:
+            data = self.__plot_data__[title]
+            assert isinstance(data, PlotData)
+            if len(data.step) > 0:
+                titles.append(title)
+                self.plt.plot(data.step, data.value)
+        self.plt.legend(titles, loc='upper left')
+        self.display.clear_output(wait=True)
+        self.display.display(self.plt.gcf())
+        self.plt.gcf().clear()
+
+    def reset(self):
+        for key in self.__plot_data__:
+            data = self.__plot_data__[key]
+            assert isinstance(data, PlotData)
+            data.reset()
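A short hedged sketch of how the Ploter above is meant to be driven from a training event handler; the curve title and the batch interval are arbitrary choices, not part of this diff.

```python
# Illustrative use of paddle.v2.plot.Ploter inside an event handler.
import paddle.v2 as paddle
from paddle.v2.plot import Ploter

train_title = "train cost"  # arbitrary curve title
ploter = Ploter(train_title)


def event_handler_plot(event):
    if isinstance(event, paddle.event.EndIteration):
        ploter.append(train_title, event.batch_id, event.cost)
        if event.batch_id % 100 == 0:
            ploter.plot()  # a no-op when DISABLE_PLOT=True is exported
```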
diff --git a/python/paddle/v2/plot/tests/CMakeLists.txt b/python/paddle/v2/plot/tests/CMakeLists.txt
new file mode 100644
index 0000000000..da550a178c
--- /dev/null
+++ b/python/paddle/v2/plot/tests/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_test(NAME test_ploter
+  COMMAND bash ${PROJ_ROOT}/python/paddle/v2/plot/tests/run_tests.sh
+  ${PYTHON_EXECUTABLE})
diff --git a/python/paddle/v2/plot/tests/__init__.py b/python/paddle/v2/plot/tests/__init__.py
new file mode 100644
index 0000000000..d1abfc08f1
--- /dev/null
+++ b/python/paddle/v2/plot/tests/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import test_ploter
+
+__all__ = ['test_ploter']
diff --git a/python/paddle/v2/plot/tests/run_tests.sh b/python/paddle/v2/plot/tests/run_tests.sh
new file mode 100755
index 0000000000..9c1a4a71ce
--- /dev/null
+++ b/python/paddle/v2/plot/tests/run_tests.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+pushd `dirname $0` > /dev/null
+SCRIPTPATH=$PWD
+popd > /dev/null
+
+cd $SCRIPTPATH
+$1 -m pip install ../../../../../paddle/dist/*.whl
+
+export DISABLE_PLOT="True"
+test_list="test_ploter.py"
+
+export PYTHONPATH=$PWD/../../../../../python/
+
+for fn in $test_list
+do
+  echo "test $fn"
+  $1 $fn
+  if [ $? -ne 0 ]; then
+    exit 1
+  fi
+done
diff --git a/python/paddle/v2/plot/tests/test_ploter.py b/python/paddle/v2/plot/tests/test_ploter.py
new file mode 100644
index 0000000000..a75f853ed9
--- /dev/null
+++ b/python/paddle/v2/plot/tests/test_ploter.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from paddle.v2.plot import Ploter
+
+
+class TestCommon(unittest.TestCase):
+    def test_append(self):
+        title1 = "title1"
+        title2 = "title2"
+        plot_test = Ploter(title1, title2)
+        plot_test.append(title1, 1, 2)
+        plot_test.append(title1, 2, 5)
+        plot_test.append(title2, 3, 4)
+        self.assertEqual(plot_test.__plot_data__[title1].step, [1, 2])
+        self.assertEqual(plot_test.__plot_data__[title1].value, [2, 5])
+        self.assertEqual(plot_test.__plot_data__[title2].step, [3])
+        self.assertEqual(plot_test.__plot_data__[title2].value, [4])
+        plot_test.reset()
+        self.assertEqual(plot_test.__plot_data__[title1].step, [])
+        self.assertEqual(plot_test.__plot_data__[title1].value, [])
+        self.assertEqual(plot_test.__plot_data__[title2].step, [])
+        self.assertEqual(plot_test.__plot_data__[title2].value, [])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
index 89cc928dd7..c67f3b84d9 100644
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -19,6 +19,7 @@ import paddle.v2.data_type as data_type
 import paddle.v2.layer as layer
 import paddle.v2.pooling as pooling
 import paddle.v2.networks as networks
+import paddle.v2.evaluator as evaluator
 
 pixel = layer.data(name='pixel', type=data_type.dense_vector(128))
 label = layer.data(name='label', type=data_type.integer_value(10))
@@ -58,13 +59,13 @@ class ImageLayerTest(unittest.TestCase):
             num_channels=16,
             pool_type=pooling.Max())
         maxout = layer.maxout(input=conv, num_channels=16, groups=4)
-        print layer.parse_network(maxpool, spp, maxout)
+        print layer.parse_network([maxpool, spp, maxout])
 
     def test_norm_layer(self):
         norm1 = layer.img_cmrnorm(input=conv, size=5)
         norm2 = layer.batch_norm(input=conv)
         norm3 = layer.sum_to_one_norm(input=conv)
-        print layer.parse_network(norm1, norm2, norm3)
+        print layer.parse_network([norm1, norm2, norm3])
 
 
 class AggregateLayerTest(unittest.TestCase):
@@ -77,7 +78,8 @@ class AggregateLayerTest(unittest.TestCase):
         first_seq = layer.first_seq(input=pixel)
         concat = layer.concat(input=[last_seq, first_seq])
         seq_concat = layer.seq_concat(a=last_seq, b=first_seq)
-        print layer.parse_network(pool, last_seq, first_seq, concat, seq_concat)
+        print layer.parse_network(
+            [pool, last_seq, first_seq, concat, seq_concat])
 
 
 class MathLayerTest(unittest.TestCase):
@@ -94,8 +96,10 @@ class MathLayerTest(unittest.TestCase):
         tensor = layer.tensor(a=pixel, b=pixel, size=1000)
         cos_sim = layer.cos_sim(a=pixel, b=pixel)
         trans = layer.trans(input=tensor)
-        print layer.parse_network(addto, linear_comb, interpolation, power,
-                                  scaling, slope, tensor, cos_sim, trans)
+        print layer.parse_network([
+            addto, linear_comb, interpolation, power, scaling, slope, tensor,
+            cos_sim, trans
+        ])
 
 
 class ReshapeLayerTest(unittest.TestCase):
@@ -109,7 +113,8 @@ class ReshapeLayerTest(unittest.TestCase):
         repeat = layer.repeat(input=pixel, num_repeats=4)
         reshape = layer.seq_reshape(input=pixel, reshape_size=4)
         rotate = layer.rotate(input=pixel, height=16, width=49)
-        print layer.parse_network(block_expand, expand, repeat, reshape, rotate)
+        print layer.parse_network(
+            [block_expand, expand, repeat, reshape, rotate])
 
 
 class RecurrentLayerTest(unittest.TestCase):
@@ -118,7 +123,7 @@ class RecurrentLayerTest(unittest.TestCase):
         recurrent = layer.recurrent(input=word)
         lstm = layer.lstmemory(input=word)
         gru = layer.grumemory(input=word)
-        print layer.parse_network(recurrent, lstm, gru)
+        print layer.parse_network([recurrent, lstm, gru])
 
 
 class CostLayerTest(unittest.TestCase):
@@ -138,10 +143,10 @@ class CostLayerTest(unittest.TestCase):
         cost10 = layer.sum_cost(input=inference)
         cost11 = layer.huber_cost(input=score, label=label)
 
-        print layer.parse_network(cost1, cost2)
-        print layer.parse_network(cost3, cost4)
-        print layer.parse_network(cost5, cost6)
-        print layer.parse_network(cost7, cost8, cost9, cost10, cost11)
+        print layer.parse_network([cost1, cost2])
+        print layer.parse_network([cost3, cost4])
+        print layer.parse_network([cost5, cost6])
+        print layer.parse_network([cost7, cost8, cost9, cost10, cost11])
 
         crf = layer.crf(input=inference, label=label)
         crf_decoding = layer.crf_decoding(input=inference, size=3)
@@ -150,8 +155,8 @@ class CostLayerTest(unittest.TestCase):
         nce = layer.nce(input=inference, label=label, num_classes=3)
         hsigmoid = layer.hsigmoid(input=inference, label=label, num_classes=3)
 
-        print layer.parse_network(crf, crf_decoding, ctc, warp_ctc, nce,
-                                  hsigmoid)
+        print layer.parse_network(
+            [crf, crf_decoding, ctc, warp_ctc, nce, hsigmoid])
 
 
 class OtherLayerTest(unittest.TestCase):
@@ -159,7 +164,7 @@ class OtherLayerTest(unittest.TestCase):
         maxid = layer.max_id(input=inference)
         sampling_id = layer.sampling_id(input=inference)
         eos = layer.eos(input=maxid, eos_id=5)
-        print layer.parse_network(maxid, sampling_id, eos)
+        print layer.parse_network([maxid, sampling_id, eos])
 
     def test_slicing_joining_layer(self):
         pad = layer.pad(input=conv, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
@@ -262,5 +267,20 @@ class NetworkTests(unittest.TestCase):
         print layer.parse_network(vgg_out)
 
 
+class EvaluatorTest(unittest.TestCase):
+    def test_evaluator(self):
+        img = layer.data(name='pixel', type=data_type.dense_vector(784))
+        output = layer.fc(input=img,
+                          size=10,
+                          act=activation.Softmax(),
+                          name='fc_here')
+        lbl = layer.data(name='label', type=data_type.integer_value(10))
+        cost = layer.cross_entropy_cost(input=output, label=lbl)
+
+        evaluator.classification_error(input=output, label=lbl)
+        print layer.parse_network(cost)
+        print layer.parse_network(output)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
index f0679c5675..737b6bf1e2 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -17,7 +17,6 @@ import collections
 from paddle.proto.ModelConfig_pb2 import ModelConfig
 
 import layer as v2_layer
-from layer import WithExtraParent
 
 __all__ = ['Topology']
 
@@ -41,9 +40,8 @@ def __bfs_travel__(callback, *layers):
         __break__ = callback(each_layer)
         if __break__:
             return
-        __layers__ = each_layer.__parent_layers__.values()
-        if isinstance(each_layer, WithExtraParent):
-            __layers__ = __layers__ + each_layer.extra_parent()
+        __layers__ = each_layer.__parent_layers__.values() + \
+                     each_layer.extra_parent()
         __bfs_travel__(callback, *__layers__)
 
 
@@ -53,14 +51,26 @@ class Topology(object):
     and network configs.
""" - def __init__(self, layers): - if not isinstance(layers, collections.Sequence): - __check_layer_type__(layers) - layers = [layers] - for layer in layers: - __check_layer_type__(layer) + def __init__(self, layers, extra_layers=None): + def __check__(layers): + if not isinstance(layers, collections.Sequence): + __check_layer_type__(layers) + layers = [layers] + for layer in layers: + __check_layer_type__(layer) + return layers + + layers = __check__(layers) self.layers = layers - self.__model_config__ = v2_layer.parse_network(*layers) + if extra_layers is not None: + extra_layers = __check__(extra_layers) + + self.__model_config__ = v2_layer.parse_network( + layers, extra_layers=extra_layers) + + if extra_layers is not None: + self.layers.extend(extra_layers) + assert isinstance(self.__model_config__, ModelConfig) def proto(self): diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 7bd3e2c565..68b4967cc0 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -1,3 +1,6 @@ +""" +Module Trainer +""" import collections import py_paddle.swig_paddle as api @@ -9,10 +12,6 @@ from . import optimizer as v2_optimizer from . import parameters as v2_parameters __all__ = ['SGD'] -""" -Trainer package -TODO(yuyang18): Complete comments. -""" def default_event_handler(event): @@ -29,7 +28,8 @@ def default_event_handler(event): class SGD(object): """ Simple SGD Trainer. - TODO(yuyang18): Complete comments + SGD Trainer combines data reader, network topolopy and update_equation together + to train/test a neural network. :param update_equation: The optimizer object. :type update_equation: paddle.v2.optimizer.Optimizer @@ -37,9 +37,12 @@ class SGD(object): :type cost: paddle.v2.config_base.Layer :param parameters: The parameters dictionary. :type parameters: paddle.v2.parameters.Parameters + :param extra_layers: Some layers in the neural network graph are not + in the path of cost layer. + :type extra_layers: paddle.v2.config_base.Layer """ - def __init__(self, cost, parameters, update_equation): + def __init__(self, cost, parameters, update_equation, extra_layers=None): if not isinstance(parameters, v2_parameters.Parameters): raise TypeError('parameters should be parameters') @@ -47,11 +50,17 @@ class SGD(object): if not isinstance(update_equation, v2_optimizer.Optimizer): raise TypeError("update equation parameter must be " "paddle.v2.optimizer.Optimizer") - topology = Topology(cost) + topology = Topology(cost, extra_layers=extra_layers) self.__optimizer__ = update_equation self.__topology__ = topology self.__parameters__ = parameters self.__topology_in_proto__ = topology.proto() + + # In local mode, disable sparse_remote_update. + for param in self.__topology_in_proto__.parameters: + if param.sparse_remote_update: + param.sparse_remote_update = False + self.__data_types__ = topology.data_type() gm = api.GradientMachine.createFromConfigProto( self.__topology_in_proto__, api.CREATE_MODE_NORMAL, @@ -65,14 +74,16 @@ class SGD(object): """ Training method. Will train num_passes of input data. - :param reader: + :param reader: A reader that reads and yeilds data items. Usually we use a + batched reader to do mini-batch training. + :type reader: collections.Iterable :param num_passes: The total train passes. :param event_handler: Event handler. A method will be invoked when event occurred. :type event_handler: (BaseEvent) => None :param feeding: Feeding is a map of neural network input name and array index that reader returns. 
-        :type feeding: dict
+        :type feeding: dict|list
         :return:
         """
         if event_handler is None:
@@ -123,6 +134,16 @@ class SGD(object):
         self.__gradient_machine__.finish()
 
     def test(self, reader, feeding=None):
+        """
+        Testing method. It tests the given input data.
+
+        :param reader: A reader that reads and yields data items.
+        :type reader: collections.Iterable
+        :param feeding: Feeding is a map of neural network input name and
+                        array index that reader returns.
+        :type feeding: dict
+        :return:
+        """
         feeder = DataFeeder(self.__data_types__, feeding)
         evaluator = self.__gradient_machine__.makeEvaluator()
         out_args = api.Arguments.createArguments(0)
diff --git a/python/setup.py.in b/python/setup.py.in
index 68ca35265c..4ac35e3b8d 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -7,7 +7,8 @@ packages=['paddle',
           'paddle.utils',
           'paddle.v2',
           'paddle.v2.dataset',
-          'paddle.v2.reader']
+          'paddle.v2.reader',
+          'paddle.v2.plot']
 
 setup(name='paddle',
       version='${PADDLE_VERSION}',
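Finally, a hedged end-to-end sketch that ties the trainer-side changes together (the new extra_layers argument and the event handler); the MNIST reader, layer sizes and hyperparameters are illustrative choices, not prescribed by this diff.

```python
# Illustrative v2 training loop exercising SGD(extra_layers=...).
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

img = paddle.layer.data(name='pixel', type=paddle.data_type.dense_vector(784))
lbl = paddle.layer.data(name='label', type=paddle.data_type.integer_value(10))
out = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(input=out, label=lbl)

parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Momentum(momentum=0.9, learning_rate=1e-3)

trainer = paddle.trainer.SGD(cost=cost,
                             parameters=parameters,
                             update_equation=optimizer,
                             extra_layers=[out])  # keep `out` in the graph


def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 100 == 0:
            print "Pass %d, Batch %d, Cost %f" % (
                event.pass_id, event.batch_id, event.cost)


trainer.train(
    reader=paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192),
        batch_size=128),
    num_passes=2,
    event_handler=event_handler)
```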