diff --git a/.clang_format.hook b/.clang_format.hook new file mode 100755 index 0000000000..1d92821686 --- /dev/null +++ b/.clang_format.hook @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +readonly VERSION="3.8" + +version=$(clang-format -version) + +if ! [[ $version == *"$VERSION"* ]]; then + echo "clang-format version check failed." + echo "a version contains '$VERSION' is needed, but get '$version'" + echo "you can install the right version, and make an soft-link to '\$PATH' env" + exit -1 +fi + +clang-format $@ diff --git a/.gitignore b/.gitignore index c84b2fc8c7..9622ab78e0 100644 --- a/.gitignore +++ b/.gitignore @@ -24,4 +24,5 @@ cmake-build-* python/paddle/v2/framework/core.so CMakeFiles cmake_install.cmake - +paddle/.timestamp +python/paddlepaddle.egg-info/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 61b989dc69..a772125df6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,14 +17,20 @@ - id: detect-private-key files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer -- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git - sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 +- repo: local hooks: - - id: clang-formater -- repo: https://github.com/dnephin/pre-commit-golang - sha: e4693a4c282b4fc878eda172a929f7a6508e7d16 + - id: clang-format-with-version-check + name: clang-format + description: Format files with ClangFormat. + entry: ./.clang_format.hook -i + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ +- repo: https://github.com/PaddlePaddle/pre-commit-golang + sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 hooks: - - id: go-fmt - files: (.*\.go) - - id: go-lint - files: (.*\.go) + - id: go-fmt + types: + - go + - id: gometalinter + types: + - go diff --git a/.travis.yml b/.travis.yml index 498674469b..b4b83fcdbc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ cache: - $HOME/.ccache - $HOME/.cache/pip - $TRAVIS_BUILD_DIR/build/third_party + - $TRAVIS_BUILD_DIR/build_android/third_party sudo: required dist: trusty os: @@ -11,6 +12,7 @@ os: env: - JOB=build_doc - JOB=check_style + - JOB=build_android addons: apt: packages: @@ -35,10 +37,12 @@ before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. 
- - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - - pip install rarfile + - pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt + - pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker - curl https://glide.sh/get | bash - eval "$(GIMME_GO_VERSION=1.8.3 gimme)" + - go get -u github.com/alecthomas/gometalinter + - gometalinter --install - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: diff --git a/CMakeLists.txt b/CMakeLists.txt index fb1c85bf74..ad559672ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,10 +13,9 @@ # limitations under the License cmake_minimum_required(VERSION 3.0) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) -set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}) +set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) @@ -37,6 +36,8 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) +option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) +option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -54,6 +55,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) +option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -75,6 +77,10 @@ if(ANDROID) "Disable PYTHON when cross-compiling for Android" FORCE) set(WITH_RDMA OFF CACHE STRING "Disable RDMA when cross-compiling for Android" FORCE) + set(WITH_MKLDNN OFF CACHE STRING + "Disable MKLDNN when cross-compiling for Android" FORCE) + set(WITH_MKLML OFF CACHE STRING + "Disable MKLML package when cross-compiling for Android" FORCE) endif(ANDROID) set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING @@ -88,6 +94,7 @@ endif() ######################################################################################## +include(external/mklml) # download mklml package include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -95,6 +102,7 @@ include(external/gtest) # download, build, install gtest include(external/protobuf) # download, build, install protobuf include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas +include(external/mkldnn) # download, build, install mkldnn include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc include(external/any) # download libn::any @@ -114,8 +122,8 @@ include(version) # set PADDLE_VERSION include(coveralls) # set code coverage -include_directories("${PROJ_ROOT}") -include_directories("${PROJ_ROOT}/paddle/cuda/include") 
+include_directories("${PADDLE_SOURCE_DIR}") +include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") include_directories(${Boost_INCLUDE_DIRS}) @@ -130,14 +138,19 @@ set(EXTERNAL_LIBS ) if(WITH_GPU) - list(APPEND EXTERNAL_LIB ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) + list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) if(NOT WITH_DSO) - list(APPEND EXTERNAL_LIB ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) endif(NOT WITH_DSO) endif(WITH_GPU) +if(WITH_MKLDNN) + list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB}) +endif() + if(USE_NNPACK) - list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt") + include(external/nnpack) + list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS}) endif(USE_NNPACK) add_subdirectory(proto) @@ -152,10 +165,12 @@ if(WITH_GOLANG) add_subdirectory(go) endif(WITH_GOLANG) +set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) endif() + if(WITH_DOC) add_subdirectory(doc) endif() diff --git a/Dockerfile b/Dockerfile index ed5910d93b..98f61ba586 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,27 +25,26 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ git python-pip python-dev openssh-server bison \ - wget unzip tar xz-utils bzip2 gzip coreutils ntp \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ - python-numpy python-matplotlib gcc g++ \ - automake locales clang-format-3.8 swig doxygen cmake \ + python-matplotlib gcc-4.8 g++-4.8 \ + automake locales clang-format swig doxygen cmake \ liblapack-dev liblapacke-dev libboost-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ net-tools && \ apt-get clean -y # Install Go and glide -RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ - tar -C /usr/local -xzf go.tgz && \ +RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ mkdir /root/gopath && \ mkdir /root/gopath/bin && \ - mkdir /root/gopath/src && \ - rm go.tgz + mkdir /root/gopath/src ENV GOROOT=/usr/local/go GOPATH=/root/gopath # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin # install glide -RUN curl -q https://glide.sh/get | sh +RUN curl -s -q https://glide.sh/get | sh # git credential to skip password typing RUN git config --global credential.helper store @@ -56,19 +55,23 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter # version util jupyter fixes this issue. 
RUN pip install --upgrade pip && \ - pip install -U 'protobuf==3.1.0' && \ - pip install -U wheel pillow BeautifulSoup && \ + pip install -U wheel && \ pip install -U docopt PyYAML sphinx && \ - pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \ - pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \ + pip install -U sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip install pre-commit 'ipython==5.3.0' && \ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install rarfile + pip install opencv-python + +COPY ./python/requirements.txt /root/ +RUN pip install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev RUN pip install certifi urllib3[secure] + # Install woboq_codebrowser to /woboq RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ (cd /woboq \ diff --git a/Dockerfile.android b/Dockerfile.android index fa24f6f06c..c0fa58c384 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -14,6 +14,17 @@ RUN apt-get update && \ wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ apt-get clean -y +# Install Go and glide +RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ + tar -C /usr/local -xzf go.tgz && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src && \ + rm go.tgz +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin + # git credential to skip password typing RUN git config --global credential.helper store diff --git a/README.md b/README.md index 2a6beeb342..b9793c3eab 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and - [Deep Learning 101](http://book.paddlepaddle.org/index.html) - You might want to start from the this online interactive book that can run in Jupyter Notebook. + You might want to start from this online interactive book that can run in Jupyter Notebook. - [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 913f711aff..854066fd1d 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -15,23 +15,44 @@ set(CBLAS_FOUND OFF) -## Find MKL First. -set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") -set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL") +## Find MKLML First. +if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER MKLML) + set(CBLAS_INC_DIR ${MKLML_INC_DIR}) + set(CBLAS_LIBRARIES ${MKLML_LIB}) + + add_definitions(-DPADDLE_USE_MKLML) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found cblas and lapack in MKLML " + "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + return() +endif() + +## Then find MKL. 
+set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs") +set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL") + +set(MKL_INCLUDE_SEARCH_PATHS + ${MKL_ROOT}/include + ${INTEL_MKL_ROOT}/include) +set(MKL_LIB_SEARCH_PATHS + ${MKL_ROOT}/lib + ${MKL_ROOT}/lib/intel64 + ${INTEL_MKL_ROOT}/lib + ${INTEL_MKL_ROOT}/lib/intel64) find_path(MKL_INC_DIR mkl.h PATHS - ${MKL_ROOT}/include) + ${MKL_INCLUDE_SEARCH_PATHS}) find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS - ${MKL_ROOT}/include) + ${MKL_INCLUDE_SEARCH_PATHS}) find_library(MKL_CORE_LIB NAMES mkl_core PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) + ${MKL_LIB_SEARCH_PATHS}) find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) + ${MKL_LIB_SEARCH_PATHS}) find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) + ${MKL_LIB_SEARCH_PATHS}) if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) set(CBLAS_FOUND ON) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index a4f98ec7d4..51c3b918cc 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -28,6 +28,10 @@ if(NOT WITH_TIMER) add_definitions(-DPADDLE_DISABLE_TIMER) endif(NOT WITH_TIMER) +if(USE_EIGEN_FOR_BLAS) + add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS) +endif(USE_EIGEN_FOR_BLAS) + if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) @@ -67,6 +71,28 @@ else() include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) +if(WITH_MKLDNN) + add_definitions(-DPADDLE_USE_MKLDNN) + if (WITH_MKLML AND MKLDNN_IOMP_DIR) + message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}") + set(OPENMP_FLAGS "-fopenmp") + set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") + else() + find_package(OpenMP) + if(OPENMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + else() + message(WARNING "Can not find OpenMP." + "Some performance features in MKLDNN may not be available") + endif() + endif() + +endif(WITH_MKLDNN) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") @@ -102,12 +128,19 @@ if(WITH_GOLANG) message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide") endif() - add_custom_target(go_vendor) - add_custom_command(TARGET go_vendor + # this command will only run when the file it depends is missing + # or has changed, or the output is missing. + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide COMMAND env GOPATH=${GOPATH} ${GLIDE} install + COMMAND touch ${CMAKE_BINARY_DIR}/glide + DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go" - ) - add_dependencies(go_vendor go_path) + ) + + # depends on the custom command which outputs + # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to + # run every time this target is built. 
+ add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path) endif() endif(WITH_GOLANG) diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake index 6bbcd730e1..8d5d533126 100644 --- a/cmake/cpplint.cmake +++ b/cmake/cpplint.cmake @@ -27,7 +27,8 @@ set(IGNORE_PATTERN .*cblas\\.h.* .*\\.pb\\.txt .*LtrDataProvider.* - .*MultiDataProvider.*) + .*MultiDataProvider.* + .*pb.*) # add_style_check_target # @@ -41,27 +42,21 @@ macro(add_style_check_target TARGET_NAME) if(WITH_STYLE_CHECK) set(SOURCES_LIST ${ARGN}) list(REMOVE_DUPLICATES SOURCES_LIST) - list(SORT SOURCES_LIST) - foreach(filename ${SOURCES_LIST}) - set(LINT ON) foreach(pattern ${IGNORE_PATTERN}) if(filename MATCHES ${pattern}) - message(STATUS "DROP LINT ${filename}") - set(LINT OFF) + list(REMOVE_ITEM SOURCES_LIST ${filename}) endif() endforeach() - if(LINT MATCHES ON) - get_filename_component(base_filename ${filename} NAME) - set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint) - add_custom_command(OUTPUT ${CUR_GEN} - PRE_BUILD - COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" - "--filter=${STYLE_FILTER}" - "--write-success=${CUR_GEN}" ${filename} - DEPENDS ${filename} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - endif() endforeach() + + if(SOURCES_LIST) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py" + "--filter=${STYLE_FILTER}" + ${SOURCES_LIST} + COMMENT "cpplint: Checking source code style" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() endif() endmacro() diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index dcfbc5d012..5e3e437a8d 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -108,6 +108,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ENDIF() IF(ANDROID_ABI STREQUAL "arm64-v8a") SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) + SET(CMAKE_SYSTEM_PROCESSOR aarch64) ENDIF() SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") ENDIF() @@ -166,7 +167,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ENDIF() IF(ANDROID_ABI STREQUAL "arm64-v8a") - LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) + LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) ENDIF() STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") @@ -193,6 +194,10 @@ ELSE() SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN}) ENDIF() SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI}) - SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) - SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) + IF(ANDROID_ABI STREQUAL "armeabi-v7a") + SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + ENDIF() + ENDIF() ENDIF() diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 69f40df516..2c84061ff5 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -2,7 +2,7 @@ if(NOT WITH_GPU) return() endif() -set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT") +set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index 45e3764e84..85cce80b70 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -7,8 +7,8 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) 
ExternalProject_Add( extern_lib_any ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/thelink2012/any.git" - GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" + GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git" + GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d" PREFIX ${ANY_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 3e6cedbb0d..f7483f6be9 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -7,17 +7,8 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - # for latest version, please get from official website - # URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz" - # URL_MD5 "1a47e78efe365a97de0c022d127607c3" - - # for no-ssl http support, please get from bazel's mirror - # URL "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz" - # URL_MD5 "4645c66075982da6fa0bcf6b20f3e8f7" - - # get from github mirror GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG "a46d2e7337c4656f00abe54a8115f6d76153a048" + GIT_TAG "master" PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index a0d0a892c4..16e5bef4cd 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/gflags/gflags.git" + # TODO(yiwang): The annoying warnings mentioned in + # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by + # gflags. I fired a PR https://github.com/gflags/gflags/pull/230 + # to fix it. Before it gets accepted by the gflags team, we use + # my personal fork, which contains above fix, temporarily. Let's + # change this back to the official Github repo once my PR is + # merged. + GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git" PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index bd401faa6e..8a594a825a 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -52,6 +52,7 @@ ExternalProject_Add( ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) -ADD_DEPENDENCIES(glog extern_glog) +ADD_DEPENDENCIES(glog extern_glog gflags) +LINK_LIBRARIES(glog gflags) LIST(APPEND external_project_dependencies glog) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 77e06e983e..e3970073a1 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -34,9 +34,15 @@ IF(WITH_TESTING) "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE) ENDIF(WIN32) + IF(WITH_MKLML) + # wait for mklml downloading completed + SET(GTEST_DEPENDS ${MKLML_PROJECT}) + ENDIF() + ExternalProject_Add( extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${GTEST_DEPENDS} GIT_REPOSITORY "https://github.com/google/googletest.git" GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake new file mode 100644 index 0000000000..25c6b4ef52 --- /dev/null +++ b/cmake/external/mkldnn.cmake @@ -0,0 +1,67 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLDNN}) + return() +ENDIF(NOT ${WITH_MKLDNN}) + +INCLUDE(ExternalProject) + +SET(MKLDNN_PROJECT "extern_mkldnn") +SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) +SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) +SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLDNN in Paddle yet." + "Force WITH_MKLDNN=OFF") + SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) + return() +ENDIF() + +SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) +MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") +SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") + +INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) + +IF(${CBLAS_PROVIDER} STREQUAL "MKLML") + SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) + SET(MKLDNN_MKLROOT ${MKLML_ROOT}) + SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB}) + SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) + MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}") +ENDIF() + +ExternalProject_Add( + ${MKLDNN_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLDNN_DEPENDS} + GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" + GIT_TAG "v0.9" + PREFIX ${MKLDNN_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} + CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} + -DMKLROOT:PATH=${MKLDNN_MKLROOT} +) + +ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) +MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}") +LIST(APPEND external_project_dependencies mkldnn) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake new file mode 100644 index 0000000000..e9fd3d4bed --- /dev/null +++ b/cmake/external/mklml.cmake @@ -0,0 +1,67 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLML}) + return() +ENDIF(NOT ${WITH_MKLML}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLML in Paddle yet." 
+ "Force WITH_MKLML=OFF") + SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(MKLML_PROJECT "extern_mklml") +SET(MKLML_VER "mklml_lnx_2018.0.20170720") +SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") +SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") +SET(MKLML_DST_DIR "mklml") +SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) +SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) +SET(MKLML_INC_DIR ${MKLML_ROOT}/include) +SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) +SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) +SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") + +INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) + +FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(MKLML)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${MKLML_VER}\n" + " DESTINATION ${MKLML_DST_DIR})\n") + +ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_SOURCE_DIR} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} +) + +ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) +ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) +LIST(APPEND external_project_dependencies mklml) diff --git a/paddle/function/nnpack/nnpack.cmake b/cmake/external/nnpack.cmake similarity index 54% rename from paddle/function/nnpack/nnpack.cmake rename to cmake/external/nnpack.cmake index 7182730ae8..d42bcb0f32 100644 --- a/paddle/function/nnpack/nnpack.cmake +++ b/cmake/external/nnpack.cmake @@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK") find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include) find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib) find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib) +find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib) +find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib) if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB) set(NNPACK_FOUND ON) INCLUDE_DIRECTORIES(${NNPACK_INC_DIR}) + + set(NNPACK_LIBS) + list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB}) + if (NNPACK_UKERNELS_LIB) + list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB}) + endif() + if (NNPACK_CPUFEATURES_LIB) + list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB}) + endif() + if(NOT ANDROID) + list(APPEND NNPACK_LIBS "rt") + endif() else() message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})") endif() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 60a1041936..0eeccbf7d8 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -69,9 +69,22 @@ ENDIF(NOT ${CBLAS_FOUND}) MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) -ADD_LIBRARY(cblas STATIC IMPORTED) -SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES}) +# FIXME(gangliao): generate cblas target to track all high performance +# linear algebra libraries for 
cc_library(xxx SRCS xxx.c DEPS cblas) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) +FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") +IF(${CBLAS_PROVIDER} MATCHES MKL) + ADD_LIBRARY(cblas SHARED ${dummyfile}) +ELSE() + ADD_LIBRARY(cblas STATIC ${dummyfile}) +ENDIF() +TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) + IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) LIST(APPEND external_project_dependencies cblas) +ELSE() + IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") + ADD_DEPENDENCIES(cblas mklml) + ENDIF() ENDIF(NOT ${CBLAS_FOUND}) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 67a359d4b5..490c87d67e 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -24,7 +24,6 @@ IF(WITH_PYTHON) ENDIF(WITH_PYTHON) SET(py_env "") -SET(USE_VIRTUALENV_FOR_TEST 1) IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) find_python_module(numpy REQUIRED) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index c31e62fc08..ff246b2eb4 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -110,7 +110,7 @@ set(COMMON_FLAGS -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs - -Wno-error=parentheses-equality # Warnings in Pybind11 + -Wno-error=parentheses-equality # Warnings in pybind11 ) set(GPU_COMMON_FLAGS @@ -124,6 +124,7 @@ set(GPU_COMMON_FLAGS -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=unused-function # Warnings in Numpy Header. + -Wno-error=array-bounds # Warnings in Eigen::array ) if (APPLE) @@ -189,6 +190,7 @@ endif() # Modern gpu architectures: Pascal if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") + list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr) endif() # Custom gpu architecture diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 716955c7b4..d2aab938d4 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -104,6 +104,7 @@ function(merge_static_libs TARGET_NAME) foreach(lib ${libs}) list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) endforeach() + list(REMOVE_DUPLICATES libs_deps) if(APPLE) # Use OSX's libtool to merge archives # To produce a library we need at least one source file. 
@@ -127,7 +128,7 @@ function(merge_static_libs TARGET_NAME) # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() - add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}) else() # general UNIX: use "ar" to extract objects and re-add to a common lib @@ -145,11 +146,11 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${lib} ${objdir} WORKING_DIRECTORY ${objdir}) - # Empty dummy source file that goes into merged library - set(mergebase ${lib}.mergebase.c) - add_custom_command(OUTPUT ${mergebase} - COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} - DEPENDS ${objlistfile}) + # Empty dummy source file that goes into merged library + set(mergebase ${lib}.mergebase.c) + add_custom_command(OUTPUT ${mergebase} + COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} + DEPENDS ${objlistfile}) list(APPEND mergebases "${mergebase}") endforeach() @@ -184,6 +185,16 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) endif() + + # cpplint code style + foreach(source_file ${cc_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS}) + else(cc_library_SRCS) if (cc_library_DEPS) merge_static_libs(${TARGET_NAME} ${cc_library_DEPS}) @@ -234,6 +245,14 @@ function(nv_library TARGET_NAME) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) endif() + # cpplint code style + foreach(source_file ${nv_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS}) else(nv_library_SRCS) if (nv_library_DEPS) merge_static_libs(${TARGET_NAME} ${nv_library_DEPS}) @@ -285,8 +304,22 @@ function(go_library TARGET_NAME) set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") endif() - # Add dummy code to support `make target_name` under Terminal Command set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + # This custom command will always run since it depends on a not + # existing file. + add_custom_command( + OUTPUT dummy_rebulid_${TARGET_NAME} + COMMAND cmake -E touch ${dummyfile} + ) + # Create a custom target that depends on the custom command output + # file, so the custom command can be referenced as a dependency by + # `add_dependencies`. 
+ add_custom_target(rebuild_${TARGET_NAME} + DEPENDS dummy_rebulid_${TARGET_NAME} + ) + + # Add dummy code to support `make target_name` under Terminal Command file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") if (go_library_SHARED OR go_library_shared) add_library(${TARGET_NAME} SHARED ${dummyfile}) @@ -297,6 +330,12 @@ function(go_library TARGET_NAME) add_dependencies(${TARGET_NAME} ${go_library_DEPS}) endif(go_library_DEPS) + # The "source file" of the library is `${dummyfile}` which never + # change, so the target will never rebuild. Make the target depends + # on the custom command that touches the library "source file", so + # rebuild will always happen. + add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME}) + set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}") file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") @@ -337,7 +376,7 @@ function(go_test TARGET_NAME) string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" ".${CMAKE_CURRENT_SOURCE_REL_DIR}" WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") @@ -364,3 +403,16 @@ function(py_proto_compile TARGET_NAME) protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) endfunction() + +function(py_test TARGET_NAME) + if(WITH_TESTING) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} + COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python + python2 ${py_test_SRCS} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() +endfunction() diff --git a/cmake/package.cmake b/cmake/package.cmake index ff49a2d08e..79e02147f3 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -12,7 +12,7 @@ set(CPACK_PACKAGE_DESCRIPTION "") set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl") set(CPACK_DEBIAN_PACKAGE_SECTION Devel) set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION}) -set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst") +set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst") #set(CPACK_GENERATOR "DEB") # Start cpack include (CMakePackageConfigHelpers) diff --git a/cmake/util.cmake b/cmake/util.cmake index 87ad9d91d8..0da4969d31 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -118,7 +118,6 @@ endfunction() macro(add_unittest_without_exec TARGET_NAME) add_executable(${TARGET_NAME} ${ARGN}) link_paddle_test(${TARGET_NAME}) - add_style_check_target(${TARGET_NAME} ${ARGN}) endmacro() # add_unittest @@ -142,17 +141,20 @@ endmacro() function(create_resources res_file output_file) add_custom_command( OUTPUT ${output_file} - COMMAND python ARGS ${PROJ_ROOT}/cmake/make_resource.py ${res_file} ${output_file} - DEPENDS ${res_file} ${PROJ_ROOT}/cmake/make_resource.py) + COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} + DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) 
endfunction() # Create a python unittest using run_python_tests.sh, # which takes care of making correct running environment function(add_python_test TEST_NAME) - add_test(NAME ${TEST_NAME} - COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR} - bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh - ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + foreach(arg ${ARGN}) + get_filename_component(py_fn ${arg} NAME_WE) + set(TRG_NAME ${TEST_NAME}_${py_fn}) + add_test(NAME ${TRG_NAME} + COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR} + python2 ${arg} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endforeach() endfunction() diff --git a/cmake/version.cmake b/cmake/version.cmake index ac1583a24c..cde650128a 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -4,7 +4,7 @@ set(tmp_version "HEAD") while ("${PADDLE_VERSION}" STREQUAL "") execute_process( COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version} - WORKING_DIRECTORY ${PROJ_ROOT} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} OUTPUT_VARIABLE GIT_TAG_NAME RESULT_VARIABLE GIT_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 4f4a9187bc..a4a843c610 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -104,6 +104,11 @@ cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm :noindex: + +row_l2_norm +----------- +.. autoclass:: paddle.v2.layer.row_l2_norm + :noindex: Recurrent Layers ================ @@ -198,6 +203,10 @@ identity_projection .. autoclass:: paddle.v2.layer.identity_projection :noindex: +slice_projection +------------------- +.. autoclass:: paddle.v2.layer.slice_projection + :noindex: table_projection ---------------- @@ -248,6 +257,16 @@ seq_concat .. autoclass:: paddle.v2.layer.seq_concat :noindex: +kmax_sequence_score +------------------- +.. autoclass:: paddle.v2.layer.kmax_sequence_score + :noindex: + +sub_nested_seq +-------------- +.. autoclass:: paddle.v2.layer.sub_nested_seq + :noindex: + Reshaping Layers ================ @@ -316,6 +335,11 @@ scaling .. autoclass:: paddle.v2.layer.scaling :noindex: +clip +---- +.. autoclass:: paddle.v2.layer.clip + :noindex: + slope_intercept --------------- .. autoclass:: paddle.v2.layer.slope_intercept @@ -338,6 +362,11 @@ trans .. autoclass:: paddle.v2.layer.trans :noindex: +scale_shift +----------- +.. autoclass:: paddle.v2.layer.scale_shift + :noindex: + Sampling Layers =============== @@ -474,6 +503,11 @@ prelu .. autoclass:: paddle.v2.layer.prelu :noindex: +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + Detection output Layer ====================== diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md new file mode 100644 index 0000000000..1f4d4ec16f --- /dev/null +++ b/doc/design/auto_gradient_check.md @@ -0,0 +1,146 @@ +## Auto Gradient Checker Design + +## Backgraound: +- Operator forward computing is easy to check if the result is right because it has a clear definition. **But** backpropagation is a notoriously difficult algorithm to debug and get right: + - 1. you should get the right backpropagation formula according to the forward computation. + - 2. you should implement it right in CPP. + - 3. it's difficult to prepare test data. + +- Auto gradient check gets a numeric gradient by forward Operator and use it as a reference of the backward Operator's result. 
It has several advantages:
+  - 1. The numeric gradient checker only needs the forward operator.
+  - 2. The user only needs to prepare the input data for the forward operator.
+
+## Mathematical Theory
+The following two documents from Stanford give a detailed explanation of how to get the numeric gradient and why it is useful.
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
+
+
+## Numeric Gradient Implementation
+### Python Interface
+```python
+def get_numeric_gradient(op,
+                         input_values,
+                         output_name,
+                         input_to_check,
+                         delta=0.005,
+                         local_scope=None):
+    """
+    Get the numeric gradient of an operator's input.
+
+    :param op: C++ operator instance; it could be a network.
+    :param input_values: The input variables. Should be a dictionary whose keys
+    are variable names and whose values are numpy arrays.
+    :param output_name: The final output variable name.
+    :param input_to_check: The input variable whose gradient is to be computed.
+    :param delta: The perturbation value for the numeric gradient method. The
+    smaller delta is, the more accurate the result will be, but if delta is
+    too small, numerical stability problems can occur.
+    :param local_scope: The local scope used for get_numeric_gradient.
+    :return: The gradient array in numpy format.
+    """
+```
+
+### Explanation:
+
+- Why is `output_name` needed?
+  - One operator may have multiple outputs, and an independent gradient can be computed from each output. So the user should specify one output for the calculation.
+
+- Why is `input_to_check` needed?
+  - One operator may have multiple inputs. The gradient op can calculate the gradients of these inputs at the same time, but the numeric gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute the gradients of multiple inputs, you can call `get_numeric_gradient` multiple times.
+
+
+### Core Algorithm Implementation
+
+
+```python
+    # We only compute the gradient of one element at a time.
+    # We use a for loop to compute the gradient of every element.
+    for i in xrange(tensor_size):
+        # Get one input element through its index i.
+        origin = tensor_to_check.get_float_element(i)
+
+        # Add delta to it, run the op, and then get the sum of the result tensor.
+        x_pos = origin + delta
+        tensor_to_check.set_float_element(i, x_pos)
+        y_pos = get_output()
+
+        # Subtract delta from this element, run the op, and get the sum of the result tensor.
+        x_neg = origin - delta
+        tensor_to_check.set_float_element(i, x_neg)
+        y_neg = get_output()
+
+        # Restore the old value.
+        tensor_to_check.set_float_element(i, origin)
+
+        # Compute the gradient of this element and store it into a numpy array.
+        gradient_flat[i] = (y_pos - y_neg) / delta / 2
+
+    # Reshape the gradient result to the shape of the source tensor.
+    return gradient_flat.reshape(tensor_to_check.get_dims())
+```
+
+## Auto Gradient Checker Framework
+
+Each operator kernel has three kinds of gradient:
+
+- 1. Numeric gradient
+- 2. CPU operator gradient
+- 3. GPU operator gradient (if supported)
+
+The numeric gradient only relies on the forward operator, so we use the numeric gradient as the reference value.
+
+- 1. Calculate the numeric gradient.
+- 2. Calculate the CPU kernel gradient with the backward operator and compare it with the numeric gradient.
+- 3. 
Calculate the GPU kernel gradient with the backward operator and compare it with the numeric gradient (if GPU is supported).
+
+#### Python Interface
+
+```python
+    def check_grad(self,
+                   forward_op,
+                   input_vars,
+                   inputs_to_check,
+                   output_name,
+                   no_grad_set=None,
+                   only_cpu=False,
+                   max_relative_error=0.005):
+        """
+        :param forward_op: used to create the backward_op
+        :param input_vars: numpy values of the input variables. The following
+            computation will use these variables.
+        :param inputs_to_check: input variable names whose gradients should be checked.
+        :param output_name: output name that used to
+        :param max_relative_error: The relative tolerance parameter.
+        :param no_grad_set: used when creating the backward ops
+        :param only_cpu: only compute and check the gradient on the CPU kernel.
+        :return:
+        """
+```
+
+### How to check whether two numpy arrays are close enough?
+If `abs_numeric_grad` is nearly zero, the absolute error of `numeric_grad` is used instead of the relative error.
+
+```python
+numeric_grad = ...
+operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
+
+abs_numeric_grad = numpy.abs(numeric_grad)
+# If abs_numeric_grad is nearly zero, use the absolute error for numeric_grad
+# instead of the relative error.
+abs_numeric_grad[abs_numeric_grad < 1e-3] = 1
+
+diff_mat = numpy.abs(abs_numeric_grad - operator_grad) / abs_numeric_grad
+max_diff = numpy.max(diff_mat)
+```
+
+
+#### Notes:
+1. The input data for the auto gradient checker should be reasonable, to avoid numerical problems.
+
+
+#### Refs:
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/design/cluster_train/large_model_dist_train.md b/doc/design/cluster_train/large_model_dist_train.md
new file mode 100644
index 0000000000..0c4b5bc24c
--- /dev/null
+++ b/doc/design/cluster_train/large_model_dist_train.md
@@ -0,0 +1,101 @@
+# Analysis of large model distributed training in Paddle
+
+***NOTE: This is only a note on how we implemented this scheme in V1, not a new design.***
+
+## What is it
+
+We often encounter cases where the (sparse) embedding layer parameters are so large that we cannot store them in the trainer's memory during training. So we need to place them on several parameter servers and fetch them row by row instead of fetching all of the parameters at once.
+
+## How to use
+
+Specify command-line arguments like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the paddle trainer, and also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting the pserver processes.
+
+Accordingly, configure your embedding layers like:
+
+```python
+SPARSE_REMOTE=True
+
+w1 = data_layer(name="w1", size=dict_size)
+emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
+w2 = data_layer(name="w2", size=dict_size)
+emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
+...
+```
+
+## Implementation details
+
+```c++
+enum MatType {
+  MAT_NORMAL,
+  MAT_NORMAL_SHARED,
+  MAT_VALUE_SHARED,
+  MAT_SPARSE_ROW_IDS,
+  MAT_SPARSE_ROW_AUTO_GROW,
+  MAT_CACHE_ROW,
+  MAT_SPARSE_ROW,
+  MAT_SPARSE_ROW_PREFETCH,
+  MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
+};
+```
+
+`MAT_SPARSE_ROW_PREFETCH` is the type we use when the trainer is configured to fetch only the needed rows of the matrix during training.
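To make the row-by-row fetching described above concrete, here is a rough, hypothetical sketch of the idea; `pserver_client.get_row` and the other names are illustrative assumptions, not actual PaddlePaddle APIs.

```python
# Hypothetical sketch of row-wise prefetching for a large sparse embedding
# table: only the rows touched by the current batch are pulled from the
# parameter servers, so the full table never has to fit in trainer memory.
def prefetch_embedding_rows(batch_word_ids, pserver_client, local_cache):
    needed = set(batch_word_ids)
    for row_id in needed:
        if row_id not in local_cache:
            # get_row is an assumed pserver-client call, used here only to
            # illustrate fetching a single parameter row by its id.
            local_cache[row_id] = pserver_client.get_row("emb1", row_id)
    return {row_id: local_cache[row_id] for row_id in needed}
```

The actual V1 mechanism is the `MAT_SPARSE_ROW_PREFETCH` path walked through below.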
+ +In `trainer_internal.cpp:L93 trainOneBatch`: + +```c++ + if (config_->getOptConfig().use_sparse_remote_updater()) { + REGISTER_TIMER("prefetch"); + gradientMachine_->prefetch(inArgs); + parameterUpdater_->getParametersRemote(); + } +``` + +When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver. + +In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`: + +```c++ +if (fullSize) { + ... +} else { +getParams = [&] { + parameterClient_->getParameterSparse( + /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType); +}; +applyL1 = [](Parameter& para, real decayRate) { + para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate); +}; +} +``` + +Calling `parameterClient_->getParameterSparse` will do remote call to pserver's `getParameterSparse`: + +```c++ +void ParameterServer2::getParameterSparse(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + (void)inputBuffers; + auto& buffer = *readWriteBuffer_; + size_t numReals = 0; + for (const auto& block : request.blocks()) { + numReals += getParameterConfig(block).dims(1); + } + buffer.resize(numReals); + + VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals; + + ReadLockGuard guard(parameterMutex_); + size_t offset = 0; + for (const auto& block : request.blocks()) { + size_t width = getParameterConfig(block).dims(1); + Buffer buf = {buffer.data() + offset, width}; + int type = request.send_back_parameter_type(); + sendBackParameterSparse(block, type, response, &buf, width, outputBuffers); + offset += width; + } +} +``` + +`getParameterConfig(block).dims(1)` returns the width of the current "parameter block"(a shard of parameter object), +then `getParameterSparse` remote call returns only one row of data to the client. diff --git a/doc/design/cluster_train/save_model.md b/doc/design/cluster_train/save_model.md index b70f00176b..b755185c81 100644 --- a/doc/design/cluster_train/save_model.md +++ b/doc/design/cluster_train/save_model.md @@ -75,10 +75,11 @@ snapshot to a model will be a TODO for future. ### Trainer Election One trainer will be elected as the one to save the model. When using -etcd, trainer ID is a randomly generated UUID, we will utilize etcd to -elect one trainer. When not using etcd, unique trainer IDs will be -given by the administrator, the trainer whose ID is "0" is elected to -save the model. +etcd, trainer ID is a randomly generated UUID, the trainer will +contact the master server requesting to save the model, and find out +if itself is elected. When the master server is not used, unique +trainer IDs will be given by the administrator, the trainer whose ID +is "0" is elected to save the model. 
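As a minimal sketch of the election rule described above (names here are hypothetical, not actual PaddlePaddle code), the non-etcd case reduces to comparing the administrator-assigned ID against "0":

```python
# Hypothetical sketch: when etcd is not used, only the trainer whose
# administrator-assigned ID is "0" writes the model snapshot.
def maybe_save_model(trainer_id, save_fn, snapshot_path):
    if trainer_id == "0":
        save_fn(snapshot_path)
```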
### Model Save Path diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD new file mode 100644 index 0000000000..fe8da907d9 --- /dev/null +++ b/doc/design/mkldnn/README.MD @@ -0,0 +1,111 @@ +# Intel® MKL-DNN on PaddlePaddle: Design Doc + +我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle,充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 + +我们短期内的基本目标是: + +- 完成常用layer的MKL-DNN实现。 +- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。 + + +## Contents + +- [Overview](#overview) +- [Actions](#actions) + - [CMake](#cmake) + - [Layers](#layers) + - [Activations](#activations) + - [Unit Tests](#unit-tests) + - [Protobuf Messages](#protobuf-messages) + - [Python API](#python-api) + - [Demos](#demos) + - [Benchmarking](#benchmarking) + - [Others](#others) +- [Design Concerns](#design-concerns) + +## Overview + +我们会把MKL-DNN作为第三方库集成进PaddlePaddle,整体框架图 +
+(figure: image/overview.png)
+Figure 1. PaddlePaddle on IA.
+ +## Actions +我们把集成方案大致分为了如下几个方面。 + +### CMake +我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。 + +同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。 + +所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 + +**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑。 + +### Layers +所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 +`paddle/gserver/layers`中,并且文件名都会一以*Mkldnn*开头。 + +所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。 + +### Activations +由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口,实现方法还是会在`ActivationFunction.cpp`文件。 + +### Unit Tests +会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp`和`MkldnnTester.*`用于MKL-DNN的测试。 + +Activation的测试,计划在PaddlePaddle原有的测试文件上直接添加新的测试type。 + +### Protobuf Messages +根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。 + +### Python API +目前只考虑**v1 API**。 + +计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择,方便用户选择使用MKL-DNN的layers。 + +具体实现方式比如: + +```python +use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) +if use_mkldnn + self.layer_type = mkldnn_* +``` + +所有MKL-DNN的layer type会以*mkldnn_*开头,以示区分。 + +并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py `和`layers.py`里面添加必要的MKL-DNN的接口。 + +### Demos + +会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。 + +### Benchmarking +会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`,添加使用MKL-DNN的测试。 + +### Others +1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64。 +2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 + +## Design Concerns + +为了更好的符合PaddlePaddle的代码风格\[[2](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]。 + +我们总结出一些特别需要注意的点: + +1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MkldnnLayer`特有的设备ID。 +2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 +3. 创建`MkldnnMatrix`,用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。 +4. 创建`MkldnnBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream`和`CpuEngine`,和未来可能还会用到`FPGAEngine`等。 +5. 在**Argument**里添加两个`MkldnnMatrixPtr`,取名为`mkldnnValue`和`mkldnnGrad`,用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名),用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。 +6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑,用于判断`deviceId`,并针对device在MKL-DNN和CPU之间不统一的情况,做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。 +7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 +8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况,所以需要在保存参数时同时保存该格式信息。目前准备扩展[Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247)里面的`int32_t version`。这个值不管是在v1还是在v2里面,一直保存的是0,所以可以充分利用这个信息,定义一个枚举处理所有MKLDNN的参数格式,从而`MKLDNNLayer`就可以从输入的参数中获取需要的格式信息。 + +## References + +1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN") +2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 +3. 
MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`,所以不存在这个问题),所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 + diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkldnn/image/overview.png new file mode 100644 index 0000000000..84b455c282 Binary files /dev/null and b/doc/design/mkldnn/image/overview.png differ diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md index 3692a5248a..0c10e78280 100644 --- a/doc/design/releasing_process.md +++ b/doc/design/releasing_process.md @@ -11,6 +11,15 @@ Paddle每次发新的版本,遵循以下流程: * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步 + * 编译这个版本的python wheel包,并发布到pypi。 + * 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 + * pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。 + * 上传方法: + ``` + cd build/python + pip install twine + twine upload dist/[package to upload] + ``` 4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 6. 协同完成Release Note的书写 diff --git a/doc/design/scope.md b/doc/design/scope.md index afe6bc028c..c9e0be716b 100644 --- a/doc/design/scope.md +++ b/doc/design/scope.md @@ -37,8 +37,8 @@ Scope is an association of a name to variable. All variables belong to `Scope`. ```cpp class Scope { public: - Variable* CreateVariable(const std::string& name); - const Variable* GetVariable(const std::string& name) const; + Variable* NewVar(const std::string& name); + const Variable* FindVar(const std::string& name) const; private: std::unordered_map> vars_; @@ -58,12 +58,12 @@ class Scope { public: Scope(const std::shared_ptr& scope): parent_(scope) {} - Variable* GetVariable(const std::string& name) const { + Variable* FindVar(const std::string& name) const { auto it = vars_.find(name); if (it != vars_.end()) { return it->second.get(); } else if (parent_ != nullptr) { - return parent_->GetVariable(name); + return parent_->FindVar(name); } else { return nullptr; } @@ -95,10 +95,10 @@ class Scope { static std::shared_ptr Create(const std::shared_ptr& parent = nullptr); // return nullptr if not found. - Variable* GetVariable(const std::string& name) const; + Variable* FindVar(const std::string& name) const; // return if already contains same name variable. - Variable* CreateVariable(const std::string& name); + Variable* NewVar(const std::string& name); private: std::shared_ptr parent_; @@ -107,11 +107,11 @@ class Scope { ``` ## Only scope can create a variable -To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `CreateVariable` can construct `Variable`. +To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`. ## When scope destroyed, all variables inside this scope should be destroyed together -The scope hold unique pointers for all variables. User can `GetVariable` from scope, but he should not hold this pointer as a member variable. 
Because when scope is destroyed, all variables inside this scope will be destroyed together. +The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together. ## Sharing a parent scope @@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar ## Orthogonal interface -`GetVariable` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `CreateVariable` will return a `Error` when there is a name conflict locally. Combine `GetVariable` and `CreateVariable`, we can implement `CreateOrGetVariable` easily. +`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return a `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily. diff --git a/doc/design/simple_op_design.md b/doc/design/simple_op_design.md index 49ca5db5da..5e07c29c56 100644 --- a/doc/design/simple_op_design.md +++ b/doc/design/simple_op_design.md @@ -49,6 +49,7 @@ message AttrProto { message VarProto { required string name = 1; required string comment = 2; + required bool is_tensor = 3; }; message OpProto { diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index c14160d55e..138efb566e 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -311,3 +311,13 @@ Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异 * 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 主要的解决办法是减小学习律或者对数据进行归一化处理。 + +15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2 +------------------------------------------------------------------------ +先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载: + +pip uninstall py_paddle paddle + +然后安装paddle的python环境, 在build目录下执行 + +pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md index c0608ede8e..2f14614894 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ b/doc/getstarted/build_and_install/build_from_source_en.md @@ -68,7 +68,7 @@ As a simple example, consider the following: 1. **BLAS Dependencies(optional)** - CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically. + CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically. To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`. ```bash @@ -131,9 +131,9 @@ As a simple example, consider the following: To build GPU version, you will need the following installed: 1. a CUDA-capable GPU - 2. A supported version of Linux with a gcc compiler and toolchain + 2. A supported version of Linux with a GCC compiler and toolchain 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn) + 4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn) The CUDA development environment relies on tight integration with the host development environment, including the host compiler and C runtime libraries, and is therefore only supported on @@ -172,6 +172,7 @@ export PATH=/bin:$PATH # install PaddlePaddle Python modules. 
sudo pip install /opt/paddle/share/wheels/*.whl ``` + ## Build on Centos 7 ### Install Dependencies @@ -192,9 +193,9 @@ sudo pip install /opt/paddle/share/wheels/*.whl To build GPU version, you will need the following installed: 1. a CUDA-capable GPU - 2. A supported version of Linux with a gcc compiler and toolchain + 2. A supported version of Linux with a GCC compiler and toolchain 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn) + 4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn) The CUDA development environment relies on tight integration with the host development environment, including the host compiler and C runtime libraries, and is therefore only supported on @@ -222,7 +223,7 @@ mkdir build && cd build ``` Finally, you can build and install PaddlePaddle: - + ```bash # you can add build option here, such as: cmake3 .. -DCMAKE_INSTALL_PREFIX= diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index 87c286a1af..84e3317774 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -3,6 +3,43 @@ PaddlePaddle的Docker容器使用方式 PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。 +Docker使用入门 +------------------------------ + +几个基础的概念帮助理解和使用Docker: + +- *镜像*:一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后在执行。可以执行: + + .. code-block:: bash + + docker images + + 来列出当前系统中的所有镜像,同样可以执行: + + .. code-block:: bash + + docker pull paddlepaddle/paddle:0.10.0 + + 来下载Docker镜像,paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的,推荐国内用户使用ocker.paddlepaddle.org/paddle下载。 + +- *容器*: 如果说一个Docker镜像就是一个程序,那容器就是这个程序运行时产生的“进程”。 + 实际上,一个容器就是一个操作系统的进程,但是是运行在独立的进程空间,文件系统以及网络之上。 + 可以执行: + + .. code-block:: bash + + docker run paddlepaddle/paddle:0.10.0 + + 来使用一个镜像启动一个容器。 + +- 默认情况下,Docker容器会运行在独立的文件系统空间之上,我们无法在Docker容器中 + 访问到主机上的文件。可以通过*挂载Volume*的方式,将主机上的文件或目录挂载到 + Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下,容器使用 + debian镜像,并且启动后执行 :code:`ls /data`。 + + .. code-block:: bash + + docker run --rm -v $(pwd):/data debian ls /data PaddlePaddle发布的Docker镜像使用说明 ------------------------------ @@ -12,11 +49,11 @@ PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打 像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次 PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运 行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在 -`dockerhub.com `_ 提供最新 -的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。为了方便在国 -内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您 -在国内,请把文档里命令中的paddlepaddle/paddle替换成 -docker.paddlepaddle.org/paddle。 +`dockerhub.com `_ +和国内镜像`docker.paddlepaddle.org` 提供最新 +的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。 + +**注意:为了方便在国内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您在国内,请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。** 1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev` @@ -37,13 +74,13 @@ docker.paddlepaddle.org/paddle。 .. code-block:: bash - docker run -it --rm paddlepaddle/paddle:0.10.0-dev /bin/bash + docker run -it --rm -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /bin/bash 或者,可以以后台进程方式运行容器: .. 
code-block:: bash - docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0-dev + docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D 然后用密码 :code:`root` SSH进入容器: @@ -68,6 +105,8 @@ docker.paddlepaddle.org/paddle。 如果输出是No,就需要选择使用no-AVX的镜像 + **注:在0.10.0之后的版本,PaddlePaddle都可以自动判断硬件是否支持AVX,所以无需判断AVX即可使用** + 以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index b6fd3329b2..94860240f6 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -63,12 +63,35 @@ CPU-only version and a CUDA GPU version and their no-AVX versions. We put the docker images on `dockerhub.com `_. You can find the -latest versions under "tags" tab at dockerhub.com. If you are in -China, you can use our Docker image registry mirror to speed up the -download process. To use it, please replace all paddlepaddle/paddle in -the commands to docker.paddlepaddle.org/paddle. +latest versions under "tags" tab at dockerhub.com. -1. Production images, this image might have multiple variants: +** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.** + + +1. development image :code:`paddlepaddle/paddle:-dev` + + This image has packed related develop tools and runtime + environment. Users and developers can use this image instead of + their own local computer to accomplish development, build, + releasing, document writing etc. While different version of paddle + may depends on different version of libraries and tools, if you + want to setup a local environment, you must pay attention to the + versions. The development image contains: + + - gcc/clang + - nvcc + - Python + - sphinx + - woboq + - sshd + + Many developers use servers with GPUs, they can use ssh to login to + the server and run :code:`docker exec` to enter the docker + container and start their work. Also they can start a development + docker image with SSHD service, so they can login to the container + and start work. + +2. Production images, this image might have multiple variants: - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` @@ -84,7 +107,7 @@ the commands to docker.paddlepaddle.org/paddle. if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - + **NOTE:versions after 0.10.0 will automatically detect system AVX support, so manual detect is not needed in this case.** To run the CPU-only image as an interactive container: .. code-block:: bash @@ -103,29 +126,6 @@ the commands to docker.paddlepaddle.org/paddle. nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash -2. development image :code:`paddlepaddle/paddle:-dev` - - This image has packed related develop tools and runtime - environment. Users and developers can use this image instead of - their own local computer to accomplish development, build, - releasing, document writing etc. While different version of paddle - may depends on different version of libraries and tools, if you - want to setup a local environment, you must pay attention to the - versions. 
The development image contains: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - - Many developers use servers with GPUs, they can use ssh to login to - the server and run :code:`docker exec` to enter the docker - container and start their work. Also they can start a development - docker image with SSHD service, so they can login to the container - and start work. - Train Model Using Python API ---------------------------- diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in index 95cad835b1..41b35b5b23 100644 --- a/doc/templates/conf.py.cn.in +++ b/doc/templates/conf.py.cn.in @@ -13,22 +13,18 @@ # serve to show the default. import sys import os, subprocess +sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python')) import shlex from recommonmark import parser, transform -try: - import py_paddle - import paddle - import paddle.v2 -except ImportError: - print("Must install paddle python package before generating documentation") - sys.exit(1) +import paddle +import paddle.v2 MarkdownParser = parser.CommonMarkParser AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -templates_path = ["@PROJ_ROOT@/doc_theme/templates"] +templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"] # -- General configuration ------------------------------------------------ @@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['@PROJ_ROOT@/doc_theme/static'] +html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static'] # Output file base name for HTML help builder. htmlhelp_basename = project + 'doc' diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in index b477f0120c..5822c2481d 100644 --- a/doc/templates/conf.py.en.in +++ b/doc/templates/conf.py.en.in @@ -13,15 +13,11 @@ # serve to show the default. import sys import os, subprocess +sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python')) import shlex from recommonmark import parser, transform -try: - import py_paddle - import paddle - import paddle.v2 -except ImportError: - print("Must install paddle python package before generating documentation") - sys.exit(1) +import paddle +import paddle.v2 MarkdownParser = parser.CommonMarkParser @@ -29,7 +25,7 @@ AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -templates_path = ["@PROJ_ROOT@/doc_theme/templates"] +templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"] # -- General configuration ------------------------------------------------ @@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['@PROJ_ROOT@/doc_theme/static'] +html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static'] # Output file base name for HTML help builder. htmlhelp_basename = project + 'doc' diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go index 54fa254863..739c4c01e0 100644 --- a/go/cmd/master/master.go +++ b/go/cmd/master/master.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package main import ( @@ -5,12 +19,15 @@ import ( "net" "net/http" "net/rpc" + "os" + "os/signal" "strconv" "strings" "time" "github.com/namsral/flag" log "github.com/sirupsen/logrus" + "github.com/topicai/candy" "github.com/PaddlePaddle/Paddle/go/master" "github.com/PaddlePaddle/Paddle/go/utils/networkhelper" @@ -20,11 +37,18 @@ func main() { port := flag.Int("port", 8080, "port of the master server.") ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.") endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.") - taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.") - taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.") - chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.") + taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timout duration.") + taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.") + chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.") + logLevel := flag.String("log-level", "info", + "log level, possible values: debug, info, warning, error, fatal, panic") flag.Parse() + level, e := log.ParseLevel(*logLevel) + candy.Must(e) + + log.SetLevel(level) + if *endpoints == "" { log.Warningln("-endpoints not set, fault tolerance not be enabled.") } @@ -46,6 +70,20 @@ func main() { store = &master.InMemStore{} } + shutdown := func() { + log.Infoln("shutting down gracefully") + err := store.Shutdown() + if err != nil { + log.Errorln(err) + } + } + + // Guaranteed to run even panic happens. + defer shutdown() + + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax) if err != nil { log.Fatal(err) @@ -62,8 +100,12 @@ func main() { log.Fatal(err) } - err = http.Serve(l, nil) - if err != nil { - log.Fatal(err) - } + go func() { + err = http.Serve(l, nil) + if err != nil { + log.Fatal(err) + } + }() + + <-c } diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index b331b8126c..bec5775d54 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -1,9 +1,25 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package main import ( "net" "net/http" "net/rpc" + "os" + "os/signal" "strconv" "time" @@ -16,10 +32,11 @@ import ( func main() { port := flag.Int("port", 0, "port of the pserver") - index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0") + index := flag.Int("index", -1, "index of the pserver, set to -1 if use etcd for auto pserver index registry") etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") - etcdTimeout := flag.Duration("etcd-timeout", 5*time.Second, "timeout for etcd calls") + dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout") + etcdTTL := flag.Int("etcd-ttl", 5, "etcd time to live in seconds") numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job") checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path") checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds") @@ -39,16 +56,34 @@ func main() { if *index >= 0 { idx = *index } else { - e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout) - idx, err = e.Register() + e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *dialTimeout, *etcdTTL) + idx, err = e.Register(*port) candy.Must(err) - cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e) + cp, err = pserver.LoadCheckpoint(e, idx) if err != nil { - log.Errorf("Fetch checkpoint failed, %s", err) + if err == pserver.ErrCheckpointNotFound { + log.Infof("Could not find the pserver checkpoint.") + } else { + panic(err) + } + } + } + + shutdown := func() { + log.Infoln("shutting down gracefully") + sErr := e.Shutdown() + if sErr != nil { + log.Errorln(sErr) } } + // Guaranteed to run even panic happens. + defer shutdown() + + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp) candy.Must(err) @@ -59,7 +94,11 @@ func main() { l, err := net.Listen("tcp", ":"+strconv.Itoa(*port)) candy.Must(err) - log.Infof("start pserver at port %d", *port) - err = http.Serve(l, nil) - candy.Must(err) + go func() { + log.Infof("start pserver at port %d", *port) + err = http.Serve(l, nil) + candy.Must(err) + }() + + <-c } diff --git a/go/connection/conn.go b/go/connection/conn.go index 977e8cc123..ffa8db689d 100644 --- a/go/connection/conn.go +++ b/go/connection/conn.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
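[Editor's note] The `master.go` and `pserver.go` changes above adopt the same shutdown flow: serve RPC in a background goroutine, block on an interrupt signal, and rely on a deferred shutdown function. The following is only a minimal sketch of that pattern; the listener port and the body of `shutdown` are illustrative placeholders, not the real server code.

```go
package main

import (
	"log"
	"net"
	"net/http"
	"os"
	"os/signal"
)

func main() {
	shutdown := func() {
		log.Println("shutting down gracefully")
		// release external resources here, e.g. an etcd lease or a store handle
	}
	// Deferred functions still run when a panic unwinds main.
	defer shutdown()

	c := make(chan os.Signal, 1)
	signal.Notify(c, os.Interrupt)

	l, err := net.Listen("tcp", ":8080") // illustrative port
	if err != nil {
		log.Fatal(err)
	}

	// Serve in the background so main can wait for the signal.
	go func() {
		if err := http.Serve(l, nil); err != nil {
			log.Fatal(err)
		}
	}()

	<-c // block until Ctrl-C; the deferred shutdown runs on return
}
```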
+ package connection import ( diff --git a/go/glide.lock b/go/glide.lock index f71ae643d6..1ecdd21752 100644 --- a/go/glide.lock +++ b/go/glide.lock @@ -1,15 +1,106 @@ -hash: a8faea3a363468a88917ddeb3b1c9ea36886fb2c622acbad42604fa9cb4d3855 -updated: 2017-07-11T10:04:40.786745417+08:00 +hash: 1b9b07408ca7fac27a374dc2ccd2433e4bff090484008a037df967284949a582 +updated: 2017-08-07T23:37:48.867469328Z imports: +- name: github.com/beorn7/perks + version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 + subpackages: + - quantile +- name: github.com/boltdb/bolt + version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9 +- name: github.com/cockroachdb/cmux + version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92 - name: github.com/coreos/etcd - version: cb2a496c4ddd1c87a9f280e116649b599999ec79 + version: d0d1a87aa96ae14914751d42264262cb69eda170 subpackages: + - alarm + - auth - auth/authpb + - client - clientv3 - clientv3/concurrency + - compactor + - discovery + - embed + - error + - etcdserver + - etcdserver/api + - etcdserver/api/etcdhttp + - etcdserver/api/v2http + - etcdserver/api/v2http/httptypes + - etcdserver/api/v3client + - etcdserver/api/v3election + - etcdserver/api/v3election/v3electionpb + - etcdserver/api/v3election/v3electionpb/gw + - etcdserver/api/v3lock + - etcdserver/api/v3lock/v3lockpb + - etcdserver/api/v3lock/v3lockpb/gw + - etcdserver/api/v3rpc - etcdserver/api/v3rpc/rpctypes + - etcdserver/auth - etcdserver/etcdserverpb + - etcdserver/etcdserverpb/gw + - etcdserver/membership + - etcdserver/stats + - lease + - lease/leasehttp + - lease/leasepb + - mvcc + - mvcc/backend - mvcc/mvccpb + - pkg/adt + - pkg/contention + - pkg/cors + - pkg/cpuutil + - pkg/crc + - pkg/debugutil + - pkg/fileutil + - pkg/httputil + - pkg/idutil + - pkg/ioutil + - pkg/logutil + - pkg/monotime + - pkg/netutil + - pkg/pathutil + - pkg/pbutil + - pkg/runtime + - pkg/schedule + - pkg/srv + - pkg/tlsutil + - pkg/transport + - pkg/types + - pkg/wait + - proxy/grpcproxy/adapter + - raft + - raft/raftpb + - rafthttp + - snap + - snap/snappb + - store + - version + - wal + - wal/walpb +- name: github.com/coreos/go-semver + version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6 + subpackages: + - semver +- name: github.com/coreos/go-systemd + version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6 + subpackages: + - daemon + - journal + - util +- name: github.com/coreos/pkg + version: 3ac0863d7acf3bc44daf49afef8919af12f704ef + subpackages: + - capnslog +- name: github.com/dgrijalva/jwt-go + version: d2709f9f1f31ebcda9651b03077758c1f3a0018c +- name: github.com/ghodss/yaml + version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7 +- name: github.com/gogo/protobuf + version: 909568be09de550ed094403c2bf8a261b5bb730a + subpackages: + - proto - name: github.com/golang/protobuf version: 4bd1920723d7b7c925de087aa32e2187708897f7 subpackages: @@ -17,14 +108,63 @@ imports: - proto - name: github.com/golang/snappy version: 553a641470496b2327abcac10b36396bd98e45c9 +- name: github.com/google/btree + version: 925471ac9e2131377a91e1595defec898166fe49 +- name: github.com/grpc-ecosystem/go-grpc-prometheus + version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0 +- name: github.com/grpc-ecosystem/grpc-gateway + version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676 + subpackages: + - runtime + - runtime/internal + - utilities +- name: github.com/jonboulle/clockwork + version: 2eee05ed794112d45db504eb05aa693efd2b8b09 +- name: github.com/matttproud/golang_protobuf_extensions + version: c12348ce28de40eed0136aa2b644d0ee0650e56c + subpackages: + - pbutil - name: 
github.com/namsral/flag version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04 - name: github.com/PaddlePaddle/recordio - version: edfb82af0739c84f241c87390ec5649c7b28c129 + version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81 +- name: github.com/prometheus/client_golang + version: c5b7fccd204277076155f10851dad72b76a49317 + subpackages: + - prometheus +- name: github.com/prometheus/client_model + version: 6f3806018612930941127f2a7c6c453ba2c527d2 + subpackages: + - go +- name: github.com/prometheus/common + version: 49fee292b27bfff7f354ee0f64e1bc4850462edf + subpackages: + - expfmt + - internal/bitbucket.org/ww/goautoneg + - model +- name: github.com/prometheus/procfs + version: a1dba9ce8baed984a2495b658c82687f8157b98f + subpackages: + - xfs +- name: github.com/satori/go.uuid + version: 879c5887cd475cd7864858769793b2ceb0d44feb - name: github.com/sirupsen/logrus - version: 7f976d3a76720c4c27af2ba716b85d2e0a7e38b1 + version: a3f95b5c423586578a4e099b11a46c2479628cac - name: github.com/topicai/candy version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc +- name: github.com/ugorji/go + version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74 + subpackages: + - codec +- name: github.com/xiang90/probing + version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2 +- name: golang.org/x/crypto + version: 1351f936d976c60a0a48d728281922cf63eafb8d + repo: https://github.com/golang/crypto.git + vcs: git + subpackages: + - bcrypt + - blowfish - name: golang.org/x/net version: c8c74377599bd978aee1cf3b9b63a8634051cec2 subpackages: @@ -36,11 +176,15 @@ imports: - lex/httplex - trace - name: golang.org/x/sys - version: abf9c25f54453410d0c6668e519582a9e1115027 + version: 0f826bdd13b500be0f1d4004938ad978fcc6031e + repo: https://github.com/golang/sys.git + vcs: git subpackages: - unix - name: golang.org/x/text - version: cfdf022e86b4ecfb646e1efbd7db175dd623a8fa + version: 836efe42bb4aa16aaa17b9c155d8813d336ed720 + repo: https://github.com/golang/text.git + vcs: git subpackages: - secure/bidirule - transform @@ -60,4 +204,18 @@ imports: - stats - tap - transport -testImports: [] +- name: gopkg.in/yaml.v2 + version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b +testImports: +- name: github.com/davecgh/go-spew + version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9 + subpackages: + - spew +- name: github.com/pmezard/go-difflib + version: d8ed2627bdf02c080bf22230dbb337003b7aba2d + subpackages: + - difflib +- name: github.com/stretchr/testify + version: 05e8a0eda380579888eb53c394909df027f06991 + subpackages: + - assert diff --git a/go/glide.yaml b/go/glide.yaml index ab472c7cda..a90e71b615 100644 --- a/go/glide.yaml +++ b/go/glide.yaml @@ -6,8 +6,21 @@ import: subpackages: - clientv3 - clientv3/concurrency + - embed + - etcdserver - package: github.com/namsral/flag version: ^1.7.4-pre - package: github.com/sirupsen/logrus version: ^1.0.0 - package: github.com/topicai/candy +- package: golang.org/x/crypto + repo: https://github.com/golang/crypto.git + vcs: git +- package: golang.org/x/sys + repo: https://github.com/golang/sys.git + vcs: git +- package: golang.org/x/text + repo: https://github.com/golang/text.git + vcs: git +- package: github.com/satori/go.uuid + version: v1.1.0 diff --git a/go/master/CMakeLists.txt b/go/master/CMakeLists.txt index 30531e6469..93efa4eaf7 100644 --- a/go/master/CMakeLists.txt +++ b/go/master/CMakeLists.txt @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# if(WITH_TESTING) go_test(master_test) endif() diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt index d900850be0..082d9f3f59 100644 --- a/go/master/c/CMakeLists.txt +++ b/go/master/c/CMakeLists.txt @@ -1 +1,15 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# go_library(paddle_master SHARED DEPS paddle_go_optimizer) diff --git a/go/master/c/client.go b/go/master/c/client.go index 31f4311974..b5759c30b1 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -1,13 +1,29 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package main /* #include #include #include - #define PADDLE_MASTER_OK 0 #define PADDLE_MASTER_ERROR -1 +#define PADDLE_SAVE_MODEL_OK 1 +#define PADDLE_SAVE_MODEL_SKIP 0 + typedef int paddle_master_client; */ import "C" @@ -19,11 +35,9 @@ import ( "unsafe" "github.com/PaddlePaddle/Paddle/go/master" - "github.com/coreos/etcd/clientv3" log "github.com/sirupsen/logrus" ) -var nullPtr = unsafe.Pointer(uintptr(0)) var mu sync.Mutex var handleMap = make(map[C.paddle_master_client]*master.Client) var curHandle C.paddle_master_client @@ -52,32 +66,32 @@ func remove(client C.paddle_master_client) *master.Client { } //export paddle_new_etcd_master_client +// +// bufSize is the record buffer size. 
func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client { p := C.GoString(etcdEndpoints) - cli, err := clientv3.New(clientv3.Config{ - Endpoints: strings.Split(p, ","), - DialTimeout: time.Second * time.Duration(timeout), - }) - if err != nil { - panic(err) - } - ch := make(chan string, 1) - a, err := master.GetKey(cli, master.DefaultAddrPath, timeout) + endpoints := strings.Split(p, ",") + c, err := master.NewClient( + master.WithEtcd(endpoints, time.Duration(timeout)*time.Second), + master.WithBuffer(bufSize), + ) if err != nil { panic(err) } - ch <- a - go master.WatchKey(cli, master.DefaultAddrPath, ch) - c := master.NewClient(ch, bufSize) + return add(c) } //export paddle_new_master_client +// +// bufSize is the record buffer size. func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client { a := C.GoString(addr) - ch := make(chan string, 1) - ch <- a - c := master.NewClient(ch, bufSize) + c, err := master.NewClient(master.WithAddr(a), master.WithBuffer(bufSize)) + if err != nil { + panic(err) + } + return add(c) } @@ -86,6 +100,12 @@ func paddle_release_master_client(client C.paddle_master_client) { remove(client) } +//export paddle_start_get_records +func paddle_start_get_records(client C.paddle_master_client, pass C.int) { + c := get(client) + c.StartGetRecords(int(pass)) +} + //export paddle_set_dataset func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int { c := get(client) @@ -104,23 +124,28 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int return C.PADDLE_MASTER_OK } -// return value: -// 0:ok -// -1:error +// paddle_next_record gets the nexts training record. +// +// returns number of bytes of the records if success, -1 if failed, -2 if pass end. +// //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) r, err := c.NextRecord() if err != nil { - // Error - // TODO: return the type of error? - *record = (*C.uchar)(nullPtr) + // NOTE: use errors to indicate pass ends + if err.Error() == master.ErrAllTaskFailed.Error() || + err.Error() == master.ErrNoMoreAvailable.Error() || + err.Error() == master.ErrPassBefore.Error() { + return -2 + } + *record = (*C.uchar)(nil) return -1 } if len(r) == 0 { // Empty record - *record = (*C.uchar)(nullPtr) + *record = (*C.uchar)(nil) return 0 } @@ -130,6 +155,29 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { return C.int(size) } +// paddle_request_save_model requests the master server to approve the +// caller to save the model. +// +// returns 1 if the save the model request is approved, 0 if the +// request is rejected because other trainer is saving the model, -1 +// if error happened. 
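[Editor's note] The C API above avoids handing Go pointers to C: it returns small integer handles and keeps the real `*master.Client` values in a mutex-protected map (`add`, `get`, `remove`). A stripped-down, pure-Go sketch of that registry pattern follows; the names and the placeholder `Client` type are illustrative, not the exported API.

```go
package registry

import "sync"

// Client stands in for master.Client in this sketch.
type Client struct{ /* ... */ }

var (
	mu        sync.Mutex
	handleMap = map[int]*Client{}
	curHandle int
)

// add stores the client and returns a handle that is safe to pass to C code.
func add(c *Client) int {
	mu.Lock()
	defer mu.Unlock()
	h := curHandle
	curHandle++
	handleMap[h] = c
	return h
}

// get looks a client up by its handle.
func get(h int) *Client {
	mu.Lock()
	defer mu.Unlock()
	return handleMap[h]
}

// remove deletes the handle and returns the client it referred to.
func remove(h int) *Client {
	mu.Lock()
	defer mu.Unlock()
	c := handleMap[h]
	delete(handleMap, h)
	return c
}
```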
+// +//export paddle_request_save_model +func paddle_request_save_model(client C.paddle_master_client, trainerID string, blockMS int) C.int { + c := get(client) + need, err := c.RequestSaveModel(trainerID, time.Duration(blockMS)*time.Millisecond) + if err != nil { + log.Errorln(err) + return C.PADDLE_MASTER_ERROR + } + + if need { + return C.PADDLE_SAVE_MODEL_OK + } + + return C.PADDLE_SAVE_MODEL_SKIP +} + //export mem_free func mem_free(p unsafe.Pointer) { // "free" may be a better name for this function, but doing so diff --git a/go/master/client.go b/go/master/client.go index a2ca3f3ef8..62801b9b7f 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -1,17 +1,34 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package master import ( "os" + "time" "github.com/PaddlePaddle/Paddle/go/connection" "github.com/PaddlePaddle/recordio" + "github.com/coreos/etcd/clientv3" log "github.com/sirupsen/logrus" ) // Client is the client of the master server. type Client struct { - conn *connection.Conn - ch chan record + conn *connection.Conn + ch chan record + bufSize int } type record struct { @@ -19,33 +36,104 @@ type record struct { err error } -// NewClient creates a new Client. +// WithBuffer sets the client to buffer the training record. // // bufSize is the record buffer size. NextRecord will read from this // buffer. -func NewClient(addrCh <-chan string, bufSize int) *Client { +func WithBuffer(bufSize int) func(*Client) error { + return func(c *Client) error { + if bufSize <= 0 { + return nil + } + c.bufSize = bufSize + return nil + } +} + +// WithAddr sets the client to use fixed master address. +func WithAddr(addr string) func(c *Client) error { + return func(c *Client) error { + ch := make(chan string, 1) + ch <- addr + go c.monitorMaster(ch) + return nil + } +} + +// WithEtcd sets the client to use etcd for master discovery. +func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error { + return func(c *Client) error { + cli, err := clientv3.New(clientv3.Config{ + Endpoints: endpoints, + DialTimeout: timeout, + }) + if err != nil { + return err + } + + ch := make(chan string, 1) + a, err := GetKey(cli, DefaultAddrPath, timeout) + if err != nil { + return err + } + + if a != "" { + // Master is registered, send to the master address + // channel. + ch <- a + } + + go watchKey(cli, DefaultAddrPath, ch) + go c.monitorMaster(ch) + return nil + } +} + +// NewClient creates a new Client. +func NewClient(opts ...func(*Client) error) (*Client, error) { c := &Client{} c.conn = connection.New() - c.ch = make(chan record, bufSize) - go c.monitorMaster(addrCh) - go c.getRecords() - return c + + for _, opt := range opts { + err := opt(c) + if err != nil { + return nil, err + } + } + c.ch = make(chan record, c.bufSize) + // FIXME: connection is created asyncrosly in monitorMaster go routine, + // ensure the connection is ready for use before calling c.addClient. 
+ time.Sleep(time.Second) + return c, nil +} + +// StartGetRecords must be called at beginning of each pass +func (c *Client) StartGetRecords(passID int) { + go c.getRecords(passID) } -func (c *Client) getRecords() { +func (c *Client) getRecords(passID int) { for { - t, err := c.getTask() + t, err := c.getTask(passID) if err != nil { - // TODO(helin): wait before move on with next - // getTask call. - log.Errorln(err) - continue + if err.Error() == ErrPassBefore.Error() || + err.Error() == ErrNoMoreAvailable.Error() || + err.Error() == ErrAllTaskFailed.Error() { + c.ch <- record{nil, err} + break + } + if err.Error() == ErrPassAfter.Error() { + // wait util last pass finishes + time.Sleep(time.Second * 3) + continue + } + log.Errorf("getTask error: %s", err) } for _, chunk := range t.Chunks { - f, err := os.Open(chunk.Path) - if err != nil { - log.Errorln(err) + f, e := os.Open(chunk.Path) + if e != nil { + log.Errorln(e) continue } @@ -68,7 +156,10 @@ func (c *Client) getRecords() { // We treat a task as finished whenever the last data // instance of the task is read. This is not exactly // correct, but a reasonable approximation. - c.taskFinished(t.Meta.ID) + err = c.taskFinished(t.Meta.ID) + if err != nil { + log.Errorln(err) + } } } @@ -98,18 +189,21 @@ func (c *Client) monitorMaster(addrCh <-chan string) { } } -// SetDataset set dataset for the master server to dispatch. +// SetDataset sets dataset to dispatch for the master server. +// +// SetDataset can be call multiple times at one pass. But only the first call +// will be honored. // -// SetDataset can be call multiple times from different nodes. But -// only the first call will be honored. +// After all tasks are done, another call of SetDataset will start another pass. func (c *Client) SetDataset(globPaths []string) error { - return c.conn.Call("Service.SetDataset", globPaths, nil) + err := c.conn.Call("Service.SetDataset", globPaths, nil) + return err } // getTask gets a new task from the master server. -func (c *Client) getTask() (Task, error) { +func (c *Client) getTask(passID int) (Task, error) { var t Task - err := c.conn.Call("Service.GetTask", 0, &t) + err := c.conn.Call("Service.GetTask", passID, &t) return t, err } @@ -131,3 +225,11 @@ func (c *Client) NextRecord() ([]byte, error) { r := <-c.ch return r.r, r.err } + +// RequestSaveModel requests the master server to approve the caller +// to save the model. +func (c *Client) RequestSaveModel(trainerID string, blockDur time.Duration) (bool, error) { + var need bool + err := c.conn.Call("Service.RequestSaveModel", SaveModelRequest{TrainerID: trainerID, BlockDur: blockDur}, &need) + return need, err +} diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index 49263474c8..d5f3d79464 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
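[Editor's note] Put together, a trainer-side consumer of the client API above iterates passes explicitly: it calls `StartGetRecords` at the beginning of each pass and treats the pass-related sentinel errors from `NextRecord` as the end-of-pass marker. A hedged sketch based on the functions defined in this patch; the address, buffer size, data path, and pass count are illustrative.

```go
package main

import (
	"log"

	"github.com/PaddlePaddle/Paddle/go/master"
)

func main() {
	c, err := master.NewClient(master.WithAddr(":8080"), master.WithBuffer(16))
	if err != nil {
		log.Fatal(err)
	}
	if err := c.SetDataset([]string{"/data/train-*.recordio"}); err != nil {
		log.Fatal(err)
	}

	for pass := 0; pass < 10; pass++ {
		c.StartGetRecords(pass) // must be called at the beginning of each pass
		for {
			r, err := c.NextRecord()
			if err != nil {
				// Errors arrive over RPC, so compare messages, not instances.
				if err.Error() == master.ErrPassBefore.Error() ||
					err.Error() == master.ErrNoMoreAvailable.Error() {
					break // this pass is finished
				}
				log.Println("unexpected read error:", err)
				break
			}
			_ = r // hand the record to the trainer
		}
	}
}
```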
+ package master import ( @@ -40,22 +54,22 @@ func TestGetFinishTask(t *testing.T) { panic(err) } go func(l net.Listener) { - s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) - if err != nil { - panic(err) + s, sErr := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) + if sErr != nil { + panic(sErr) } server := rpc.NewServer() - err = server.Register(s) - if err != nil { - panic(err) + sErr = server.Register(s) + if sErr != nil { + panic(sErr) } mux := http.NewServeMux() mux.Handle(rpc.DefaultRPCPath, server) - err = http.Serve(l, mux) - if err != nil { - panic(err) + sErr = http.Serve(l, mux) + if sErr != nil { + panic(sErr) } }(l) @@ -66,11 +80,21 @@ func TestGetFinishTask(t *testing.T) { for i := 0; i < totalTask*chunkPerTask; i++ { w := recordio.NewWriter(f, -1, -1) - w.Write(nil) + _, err = w.Write(nil) + if err != nil { + panic(err) + } + // call Close to force RecordIO writing a chunk. - w.Close() + err = w.Close() + if err != nil { + panic(err) + } + } + err = f.Close() + if err != nil { + panic(err) } - f.Close() // Manually intialize client to avoid calling c.getRecords() c := &Client{} @@ -79,48 +103,56 @@ func TestGetFinishTask(t *testing.T) { ch := make(chan string, 1) ch <- addr go c.monitorMaster(ch) - c.SetDataset([]string{path}) + + err = c.SetDataset([]string{path}) + if err != nil { + panic(err) + } + checkOnePass := func(i int) { var tasks []Task for idx := 0; idx < totalTask; idx++ { - task, err := c.getTask() - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + task, cErr := c.getTask(i) + if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() { + t.Fatalf("error: %v, pass: %d\n", cErr, i) } tasks = append(tasks, task) } - _, err = c.getTask() - if err == nil { + // getting task before task finishes should return error + _, cErr := c.getTask(i) + if cErr == nil { t.Fatalf("Should get error, pass: %d\n", i) } - err = c.taskFinished(tasks[0].Meta.ID) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + cErr = c.taskFinished(tasks[0].Meta.ID) + if cErr != nil { + t.Fatalf("Error: %v, pass: %d\n", cErr, i) } - - err = c.taskFailed(tasks[0].Meta) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + // call taskFailed once won't put the task to failed queue, just ensure + // the call + cErr = c.taskFailed(tasks[0].Meta) + if cErr != nil { + t.Fatalf("Error: %v, pass: %d\n", cErr, i) } tasks = tasks[1:] - task, err := c.getTask() - if err != nil { - t.Fatal(err) + _, cErr = c.getTask(i) + if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() { + t.Fatalf("Should be ErrNoMoreAvailable or ErrPassAfter: %s", cErr) } - tasks = append(tasks, task) for _, task := range tasks { - err = c.taskFinished(task.Meta.ID) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + cErr = c.taskFinished(task.Meta.ID) + if cErr != nil { + t.Fatal(cErr) } } } for i := 0; i < 10; i++ { + // init pass data + c.StartGetRecords(i) checkOnePass(i) } } diff --git a/go/master/client_test.go b/go/master/client_test.go index 6666d3860c..79b9cc844d 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package master_test import ( @@ -6,8 +20,10 @@ import ( "net/http" "net/rpc" "os" + "runtime" "strconv" "strings" + "sync" "testing" "time" @@ -15,6 +31,18 @@ import ( "github.com/PaddlePaddle/recordio" ) +// tool function for testing output goroutine ids +func goid() int { + var buf [64]byte + n := runtime.Stack(buf[:], false) + idField := strings.Fields(strings.TrimPrefix(string(buf[:n]), "goroutine "))[0] + id, err := strconv.Atoi(idField) + if err != nil { + panic(fmt.Sprintf("cannot get goroutine id: %v", err)) + } + return id +} + func TestNextRecord(t *testing.T) { const ( path = "/tmp/master_client_TestFull" @@ -31,7 +59,7 @@ func TestNextRecord(t *testing.T) { panic(err) } go func(l net.Listener) { - s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1) + s, err := master.NewService(&master.InMemStore{}, 1, time.Second*60, 1) if err != nil { panic(err) } @@ -55,32 +83,67 @@ func TestNextRecord(t *testing.T) { panic(err) } - w := recordio.NewWriter(f, -1, -1) + w := recordio.NewWriter(f, 1, -1) for i := 0; i < total; i++ { - w.Write([]byte{byte(i)}) + _, err = w.Write([]byte{byte(i)}) + if err != nil { + panic(err) + } + } + + err = w.Close() + if err != nil { + panic(err) } - w.Close() - f.Close() - curAddr := make(chan string, 1) - curAddr <- fmt.Sprintf(":%d", p) - c := master.NewClient(curAddr, 10) - c.SetDataset([]string{path}) - for pass := 0; pass < 50; pass++ { - received := make(map[byte]bool) - for i := 0; i < total; i++ { - r, err := c.NextRecord() - if err != nil { - t.Fatal(pass, i, "Read error:", err) - } - if len(r) != 1 { - t.Fatal(pass, i, "Length should be 1.", r) + err = f.Close() + if err != nil { + panic(err) + } + + // start several client to test task fetching + var wg sync.WaitGroup + for i := 0; i < 4; i++ { + wg.Add(1) + // test for multiple concurrent clients + go func() { + defer wg.Done() + // each go-routine needs a single client connection instance + c, e := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(1)) + if e != nil { + t.Fatal(e) } + e = c.SetDataset([]string{path}) + if e != nil { + panic(e) + } + // test for n passes + for pass := 0; pass < 10; pass++ { + c.StartGetRecords(pass) - if received[r[0]] { - t.Fatal(pass, i, "Received duplicate.", received, r) + received := make(map[byte]bool) + taskid := 0 + for { + r, e := c.NextRecord() + if e != nil { + // ErrorPassAfter will wait, else break for next pass + if e.Error() == master.ErrPassBefore.Error() || + e.Error() == master.ErrNoMoreAvailable.Error() { + break + } + t.Fatal(pass, taskid, "Read error:", e) + } + if len(r) != 1 { + t.Fatal(pass, taskid, "Length should be 1.", r) + } + if received[r[0]] { + t.Fatal(pass, taskid, "Received duplicate.", received, r) + } + taskid++ + received[r[0]] = true + } } - received[r[0]] = true - } + }() } + wg.Wait() } diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go index 04c1394e96..94848d887e 100644 --- a/go/master/etcd_client.go +++ b/go/master/etcd_client.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package master import ( @@ -25,15 +39,12 @@ type EtcdClient struct { statePath string client *clientv3.Client lock *concurrency.Mutex + sess *concurrency.Session } // NewEtcdClient creates a new EtcdClient. func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) { log.Debugf("Connecting to etcd at %v", endpoints) - // TODO(helin): gracefully shutdown etcd store. Becuase etcd - // store holds a etcd lock, even though the lock will expire - // when the lease timeout, we need to implement graceful - // shutdown to release the lock. cli, err := clientv3.New(clientv3.Config{ Endpoints: endpoints, DialTimeout: dialTimeout, @@ -53,14 +64,14 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat // one master running, but split-brain problem may cause // multiple master servers running), and the cluster management // software will kill one of them. - log.Debugf("Trying to acquire lock at %s.", lockPath) + log.Infof("Trying to acquire lock at %s.", lockPath) err = lock.Lock(context.TODO()) if err != nil { return nil, err } - log.Debugf("Successfully acquired lock at %s.", lockPath) + log.Infof("Successfully acquired lock at %s.", lockPath) - put := clientv3.OpPut(addrPath, string(addr)) + put := clientv3.OpPut(addrPath, addr) resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit() if err != nil { return nil, err @@ -75,6 +86,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat statePath: statePath, client: cli, lock: lock, + sess: sess, } return e, nil @@ -143,9 +155,24 @@ func (e *EtcdClient) Load() ([]byte, error) { return state, nil } +// Shutdown shuts down the etcd client gracefully. +func (e *EtcdClient) Shutdown() error { + err := e.sess.Close() + newErr := e.client.Close() + if newErr != nil { + if err == nil { + err = newErr + } else { + log.Errorln(newErr) + } + } + + return err +} + // GetKey gets the value by the specify key. -func GetKey(c *clientv3.Client, key string, timeout int) (string, error) { - ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout)) +func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) resp, err := c.Get(ctx, key) cancel() if err != nil { @@ -159,8 +186,8 @@ func GetKey(c *clientv3.Client, key string, timeout int) (string, error) { return string(v), nil } -// WatchKey watches the specify key and send to valChan if there is some event. -func WatchKey(c *clientv3.Client, key string, valChan chan<- string) { +// watchKey watches the specify key and send to valChan if there is some event. 
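[Editor's note] The surrounding `NewEtcdClient` code (only partially visible in this hunk) guards the master address with an etcd lock so that at most one master registers itself. Below is a rough sketch of that election flow using `clientv3/concurrency`; it is not the exact body of `NewEtcdClient`, and the endpoints, TTL, paths, and address are illustrative.

```go
package main

import (
	"context"
	"log"
	"time"

	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/clientv3/concurrency"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"http://127.0.0.1:2379"},
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		log.Fatal(err)
	}

	// A session keeps a lease alive; if this process dies, the lock expires.
	sess, err := concurrency.NewSession(cli, concurrency.WithTTL(60))
	if err != nil {
		log.Fatal(err)
	}

	lock := concurrency.NewMutex(sess, "/master/lock")
	if err := lock.Lock(context.TODO()); err != nil {
		log.Fatal(err)
	}

	// Publish our address only if we still own the lock.
	put := clientv3.OpPut("/master/addr", ":8080")
	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
	if err != nil || !resp.Succeeded {
		log.Fatal("failed to register master address")
	}
}
```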
+func watchKey(c *clientv3.Client, key string, valChan chan<- string) { rch := c.Watch(context.Background(), key) for wresp := range rch { for _, ev := range wresp.Events { diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go index bcd549b20e..a5bd2d4fe1 100644 --- a/go/master/inmem_store.go +++ b/go/master/inmem_store.go @@ -1,10 +1,24 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package master import "sync" // InMemStore is an in memory implementation of Store interface. // -// It does not tolerate the fault that casues the program to crash. +// It does not tolerate the fault that causes the program to crash. type InMemStore struct { mu sync.Mutex buf []byte @@ -26,3 +40,8 @@ func (m *InMemStore) Load() ([]byte, error) { return m.buf, nil } + +// Shutdown shuts down the in mem store. +func (m *InMemStore) Shutdown() error { + return nil +} diff --git a/go/master/service.go b/go/master/service.go index a6050ab994..df7c6860e6 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package master import ( @@ -5,6 +19,7 @@ import ( "compress/gzip" "encoding/gob" "errors" + "math/rand" "os" "path/filepath" "sync" @@ -19,10 +34,23 @@ const ( dialTimeout = 5 * time.Second ) +// ErrAllTaskFailed occur when tasks are in done or failed state. +var ErrAllTaskFailed = errors.New("all task finished") + +// ErrNoMoreAvailable occur when no task in todo and yet not all done or fail. +var ErrNoMoreAvailable = errors.New("no more available task") + +// ErrPassBefore client side pass number does not match with master counter. +var ErrPassBefore = errors.New("pass number smaller than master") + +// ErrPassAfter client side pass number does not match with master counter. +var ErrPassAfter = errors.New("pass number larger than master") + // Store is the interface for save and load the master state. type Store interface { Save([]byte) error Load() ([]byte, error) + Shutdown() error } // Chunk is a chunk of data consisted of several data instances. @@ -49,11 +77,12 @@ type taskEntry struct { NumFailure int } -type taskQueues struct { +type masterState struct { Todo []taskEntry Pending map[int]taskEntry // map from task ID to task entry Done []taskEntry Failed []taskEntry + CurPass int } // Service is the master server service. 
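[Editor's note] The `Store` interface above now requires `Shutdown` in addition to `Save` and `Load`. As an illustration of that contract, here is a hypothetical file-backed implementation; it is not part of this patch (the real implementations are `InMemStore` and the etcd-backed store).

```go
package store

import (
	"io/ioutil"
	"os"
	"sync"
)

// FileStore is a hypothetical Store implementation that snapshots to a local
// file. It is only a sketch of the Save/Load/Shutdown contract.
type FileStore struct {
	mu   sync.Mutex
	path string
}

// Save persists the snapshot bytes to disk.
func (f *FileStore) Save(state []byte) error {
	f.mu.Lock()
	defer f.mu.Unlock()
	return ioutil.WriteFile(f.path, state, 0600)
}

// Load returns the last saved snapshot, or nil if none exists yet.
func (f *FileStore) Load() ([]byte, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	b, err := ioutil.ReadFile(f.path)
	if os.IsNotExist(err) {
		return nil, nil
	}
	return b, err
}

// Shutdown releases resources; nothing to do for a plain file.
func (f *FileStore) Shutdown() error { return nil }
```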
@@ -61,16 +90,26 @@ type Service struct { chunksPerTask int timeoutDur time.Duration failureMax int - ready chan struct{} store Store - mu sync.Mutex - initDone bool - taskQueues taskQueues + ready chan struct{} + initDone bool + + mu sync.Mutex + // State to be persisted to snapshot. + state masterState + // The trainer that is currently saving model. This state is + // transient, does not need to be persisted to snapshot. + savingTrainer string } func partition(chunks []Chunk, chunksPerTask int) []taskEntry { - id := 0 + // generate uniq id across job using nanosecond + randint + counter + // FIXME(typhoonzero): this is a workaround, use uuid + randStart := rand.Int() + counter := 0 + timestamp := time.Now().Nanosecond() + id := timestamp + randStart + counter if chunksPerTask <= 0 { chunksPerTask = 1 } @@ -80,7 +119,8 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry { for i, c := range chunks { if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 { cur.Task.Meta.ID = id - id++ + counter++ + id = timestamp + randStart + counter result = append(result, cur) cur.Task.Chunks = nil } @@ -102,8 +142,8 @@ func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failur s.chunksPerTask = chunksPerTask s.timeoutDur = timeoutDur s.failureMax = failureMax - s.taskQueues = taskQueues{} - s.taskQueues.Pending = make(map[int]taskEntry) + s.state = masterState{} + s.state.Pending = make(map[int]taskEntry) s.ready = make(chan struct{}) s.store = store recovered, err := s.recover() @@ -141,7 +181,7 @@ func (s *Service) recover() (bool, error) { } dec := gob.NewDecoder(gr) - var tqs taskQueues + var tqs masterState err = dec.Decode(&tqs) if err != nil { return false, err @@ -154,13 +194,18 @@ func (s *Service) recover() (bool, error) { log.Errorln(err) } - s.taskQueues = tqs + s.state = tqs + log.WithFields(s.logFields()).Infof("Master recovered from snapshot, scheduling pending task timeout check.") + for _, t := range s.state.Pending { + time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch)) + } + return true, nil } // snapshot *must* be called with s.mu being held. func (s *Service) snapshot() error { - // TOOD(helin): etcd request has a size limit, so the snapshot + // TODO(helin): etcd request has a size limit, so the snapshot // size is limited by the max request size. We should either // divide the snapshot into smaller chunks and save under // different keys, or configure the request size to be big @@ -169,7 +214,7 @@ func (s *Service) snapshot() error { var buf bytes.Buffer gw := gzip.NewWriter(&buf) enc := gob.NewEncoder(gw) - err := enc.Encode(s.taskQueues) + err := enc.Encode(s.state) if err != nil { return err } @@ -215,6 +260,7 @@ func readChunks(globPaths []string) ([]Chunk, error) { } count := index.NumChunks() + log.Infof("readChunks: file %s has %d chunks", path, count) for i := 0; i < count; i++ { chunk := Chunk{ Path: path, @@ -231,7 +277,7 @@ func readChunks(globPaths []string) ([]Chunk, error) { // // SetDataset can be call multiple times. But only the first call will // be honored. 
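[Editor's note] The `snapshot`/`recover` pair above serializes the whole `masterState` with `encoding/gob` and compresses it with gzip before handing it to the `Store`. A standalone sketch of that round trip; the state struct is deliberately simplified and the field values are illustrative.

```go
package main

import (
	"bytes"
	"compress/gzip"
	"encoding/gob"
	"fmt"
	"log"
)

type masterState struct {
	Todo    []int
	CurPass int
}

func main() {
	in := masterState{Todo: []int{1, 2, 3}, CurPass: 4}

	// Encode: gob -> gzip -> byte buffer (what snapshot() hands to the Store).
	var buf bytes.Buffer
	gw := gzip.NewWriter(&buf)
	if err := gob.NewEncoder(gw).Encode(in); err != nil {
		log.Fatal(err)
	}
	if err := gw.Close(); err != nil { // flush the gzip stream before reading buf
		log.Fatal(err)
	}

	// Decode: the reverse path used by recover().
	gr, err := gzip.NewReader(&buf)
	if err != nil {
		log.Fatal(err)
	}
	var out masterState
	if err := gob.NewDecoder(gr).Decode(&out); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("recovered: %+v\n", out)
}
```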
-func (s *Service) SetDataset(globPaths []string, dummy *int) error { +func (s *Service) SetDataset(globPaths []string, _ *int) error { if len(globPaths) == 0 { return errors.New("no dataset specified") } @@ -250,19 +296,20 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error { return err } - s.taskQueues.Todo = partition(chunks, s.chunksPerTask) + s.state.Todo = partition(chunks, s.chunksPerTask) err = s.snapshot() if err != nil { log.Errorln(err) return err } - close(s.ready) s.initDone = true return nil } +// processFailedTask retry s.failureMax times for failed task. +// return true if all task are done or failed. func (s *Service) processFailedTask(t taskEntry, epoch int) { if t.Task.Meta.Epoch != epoch { // new epoch, task launched after the @@ -277,17 +324,17 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) { } }() - delete(s.taskQueues.Pending, t.Task.Meta.ID) + delete(s.state.Pending, t.Task.Meta.ID) t.NumFailure++ if t.NumFailure > s.failureMax { log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure) - s.taskQueues.Failed = append(s.taskQueues.Failed, t) + s.state.Failed = append(s.state.Failed, t) return } - log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure) - s.taskQueues.Todo = append(s.taskQueues.Todo, t) + log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure) + s.state.Todo = append(s.state.Todo, t) return } @@ -296,7 +343,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { s.mu.Lock() defer s.mu.Unlock() - t, ok := s.taskQueues.Pending[taskID] + t, ok := s.state.Pending[taskID] if !ok { return } @@ -308,51 +355,45 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { // must be called with lock held. func (s *Service) logFields() log.Fields { return log.Fields{ - "todoLen": len(s.taskQueues.Todo), - "pendingLen": len(s.taskQueues.Pending), - "doneLen": len(s.taskQueues.Done), - "failedLen": len(s.taskQueues.Failed), + "todoLen": len(s.state.Todo), + "pendingLen": len(s.state.Pending), + "doneLen": len(s.state.Done), + "failedLen": len(s.state.Failed), + "curPass": s.state.CurPass, } } // GetTask gets a new task from the service. -func (s *Service) GetTask(dummy int, task *Task) error { +// passID is the client side pass count +func (s *Service) GetTask(passID int, task *Task) error { select { case <-s.ready: } s.mu.Lock() defer s.mu.Unlock() + if passID < s.state.CurPass { + return ErrPassBefore + } + if passID > s.state.CurPass { + // Client may get run to pass after master when one client faster than the + // other + return ErrPassAfter + } - if len(s.taskQueues.Todo) == 0 { - if len(s.taskQueues.Done) == 0 { - if len(s.taskQueues.Pending) == 0 { - err := errors.New("all task failed") - log.WithFields(s.logFields()).Warningln("All tasks failed.") - return err - } - - // TODO(helin): client need to retry in this - // error case. Gotcha: RPC client can't - // compare returned error with predefined - // errors like io.EOF, because the error - // instance deserialized from RPC is a - // different instance than the error defined - // in package. So we need to figure out a way - // for client to check this error correctly. 
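Reviewer note (illustrative, not part of the patch): the comment block removed just above explains why a trainer cannot compare an error returned over net/rpc against the package-level sentinels by identity, since the value is rebuilt from its message on the client side. With ErrPassBefore, ErrPassAfter, ErrNoMoreAvailable and ErrAllTaskFailed now exported, a caller can match on the error text instead. The helper below is a hypothetical trainer-side sketch under that assumption; the RPC service name and the retry policy are illustrative.

package trainer

import (
	"net/rpc"
	"time"

	"github.com/PaddlePaddle/Paddle/go/master"
)

// getTaskWithPass asks the master for a task belonging to passID.
// Errors crossing the RPC boundary carry only the message string, so
// they are matched against the exported sentinels by text.
func getTaskWithPass(c *rpc.Client, passID int) (master.Task, error) {
	for {
		var t master.Task
		err := c.Call("Service.GetTask", passID, &t)
		if err == nil {
			return t, nil
		}

		switch err.Error() {
		case master.ErrPassAfter.Error(), master.ErrNoMoreAvailable.Error():
			// Other trainers are still working on the current pass;
			// wait briefly and ask again.
			time.Sleep(time.Second)
		default:
			// ErrPassBefore, ErrAllTaskFailed and transport errors are
			// returned to the caller, which decides whether to advance
			// its local pass counter or give up.
			return t, err
		}
	}
}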
- err := errors.New("no more available task") - log.WithFields(s.logFields()).Warningln("No more available task.") - return err + if len(s.state.Todo) == 0 { + if len(s.state.Done) == 0 && len(s.state.Pending) == 0 { + log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass") + return ErrAllTaskFailed } - s.taskQueues.Todo = s.taskQueues.Done - s.taskQueues.Done = nil - log.WithFields(s.logFields()).Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.") + log.WithFields(s.logFields()).Warningln("No more available task.") + return ErrNoMoreAvailable } - t := s.taskQueues.Todo[0] + t := s.state.Todo[0] t.Task.Meta.Epoch++ - s.taskQueues.Todo = s.taskQueues.Todo[1:] - s.taskQueues.Pending[t.Task.Meta.ID] = t + s.state.Todo = s.state.Todo[1:] + s.state.Pending[t.Task.Meta.ID] = t err := s.snapshot() if err != nil { return err @@ -374,7 +415,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { s.mu.Lock() defer s.mu.Unlock() - t, ok := s.taskQueues.Pending[taskID] + t, ok := s.state.Pending[taskID] if !ok { log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID) return nil @@ -382,15 +423,18 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { // task finished, reset timeout t.NumFailure = 0 - s.taskQueues.Done = append(s.taskQueues.Done, t) - delete(s.taskQueues.Pending, taskID) + s.state.Done = append(s.state.Done, t) + delete(s.state.Pending, taskID) log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID) - - if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 { - log.WithFields(s.logFields()).Infoln("No more todo and pending task, start a new pass.") - s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...) - s.taskQueues.Done = nil + if len(s.state.Todo) == 0 && len(s.state.Pending) == 0 { + // increase master side pass count if all tasks finished + s.state.CurPass++ + s.state.Todo = append(s.state.Done, s.state.Failed...) + s.state.Done = []taskEntry{} + // TODO(typhoonzero): deal with failed tasks + s.state.Failed = []taskEntry{} + log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.state.CurPass) } err := s.snapshot() @@ -409,7 +453,7 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { s.mu.Lock() defer s.mu.Unlock() - t, ok := s.taskQueues.Pending[meta.ID] + t, ok := s.state.Pending[meta.ID] if !ok { log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta) return nil @@ -418,3 +462,42 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { s.processFailedTask(t, meta.Epoch) return nil } + +// SaveModelRequest is the request for saving model +type SaveModelRequest struct { + TrainerID string + BlockDur time.Duration +} + +// RequestSaveModel requests the master server to approve the caller +// to save the model. 
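Reviewer note (illustrative, not part of the patch): RequestSaveModel lets exactly one trainer win the right to save the model for a window of BlockDur; the master remembers the winning trainer ID and clears it once the window expires. The sketch below shows how a trainer might call it; the RPC service name and the saveModel callback are assumptions of this note.

package trainer

import (
	"net/rpc"
	"time"

	"github.com/PaddlePaddle/Paddle/go/master"
)

// maybeSaveModel asks the master whether this trainer should persist
// the model, and saves it only on approval.
func maybeSaveModel(c *rpc.Client, trainerID string, saveModel func() error) error {
	req := master.SaveModelRequest{
		TrainerID: trainerID,
		// The master refuses other trainers for this duration, so it
		// should comfortably cover one model save (value illustrative).
		BlockDur: 5 * time.Minute,
	}

	var need bool
	if err := c.Call("Service.RequestSaveModel", req, &need); err != nil {
		return err
	}

	if !need {
		// Another trainer is already saving; nothing to do.
		return nil
	}
	return saveModel()
}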
+func (s *Service) RequestSaveModel(req SaveModelRequest, need *bool) error { + s.mu.Lock() + defer s.mu.Unlock() + + if req.TrainerID == "" { + return errors.New("trainer id is empty") + } + + if s.savingTrainer == "" { + *need = true + } else { + if req.TrainerID == s.savingTrainer { + // save trainer asked to save model again + *need = true + } else { + *need = false + } + } + + if *need { + s.savingTrainer = req.TrainerID + time.AfterFunc(req.BlockDur, func() { + s.mu.Lock() + s.savingTrainer = "" + s.mu.Unlock() + }) + } + + return nil +} diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go index 9c0d1d0a39..bd1a939a55 100644 --- a/go/master/service_internal_test.go +++ b/go/master/service_internal_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package master import "testing" @@ -30,7 +44,8 @@ func TestPartionIndex(t *testing.T) { cs := make([]Chunk, 100) ts := partition(cs, 20) for i := range ts { - if ts[i].Task.Meta.ID != i { + // test auto increament ids + if i > 0 && ts[i].Task.Meta.ID != ts[i-1].Task.Meta.ID+1 { t.Error(ts[i], i) } } diff --git a/go/master/service_test.go b/go/master/service_test.go new file mode 100644 index 0000000000..2d00c22d6f --- /dev/null +++ b/go/master/service_test.go @@ -0,0 +1,72 @@ +package master_test + +import ( + "io/ioutil" + "net/url" + "os" + "strings" + "testing" + "time" + + "github.com/PaddlePaddle/Paddle/go/master" + "github.com/coreos/etcd/clientv3" + "github.com/coreos/etcd/embed" + "github.com/stretchr/testify/assert" +) + +func TestNewServiceWithEtcd(t *testing.T) { + // setup an embed etcd server + etcdDir, err := ioutil.TempDir("", "") + if err != nil { + t.Fatal(err) + } + cfg := embed.NewConfig() + lpurl, _ := url.Parse("http://localhost:0") + lcurl, _ := url.Parse("http://localhost:0") + cfg.LPUrls = []url.URL{*lpurl} + cfg.LCUrls = []url.URL{*lcurl} + cfg.Dir = etcdDir + e, err := embed.StartEtcd(cfg) + if err != nil { + t.Fatal(err) + } + defer func() { + e.Close() + if err := os.RemoveAll(etcdDir); err != nil { + t.Fatal(err) + } + }() + + <-e.Server.ReadyNotify() + + port := strings.Split(e.Clients[0].Addr().String(), ":")[1] + endpoint := "127.0.0.1:" + port + + ep := []string{endpoint} + masterAddr := "127.0.0.1:3306" + store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30) + if err != nil { + t.Fatal(err) + } + + _, err = master.NewService(store, 10, 10, 3) + if err != nil { + t.Fatal(err) + } + cli, err := clientv3.New(clientv3.Config{ + Endpoints: ep, + DialTimeout: 3 * time.Second, + }) + if err != nil { + t.Fatal(err) + } + v, err := master.GetKey(cli, master.DefaultAddrPath, 3*time.Second) + if err != nil { + t.Fatal(err) + } + if err := cli.Close(); err != nil { + t.Fatal(err) + } + // test master process registry itself into etcd server. 
+ assert.Equal(t, masterAddr, v, "master process should registry itself into etcd server.") +} diff --git a/go/pserver/CMakeLists.txt b/go/pserver/CMakeLists.txt index 6267040a6e..4fe0a8cb02 100644 --- a/go/pserver/CMakeLists.txt +++ b/go/pserver/CMakeLists.txt @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# if(WITH_TESTING) go_test(pserver_test DEPS paddle_go_optimizer) endif() diff --git a/go/pserver/client/CMakeLists.txt b/go/pserver/client/CMakeLists.txt index 0052bb460b..e295611060 100644 --- a/go/pserver/client/CMakeLists.txt +++ b/go/pserver/client/CMakeLists.txt @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# if(WITH_TESTING) go_test(pserver_client_test DEPS paddle_go_optimizer) endif() diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt index c6333eab55..a932791c7c 100644 --- a/go/pserver/client/c/CMakeLists.txt +++ b/go/pserver/client/c/CMakeLists.txt @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf) target_link_libraries(paddle_go_optimizer stdc++ m) diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go index 7ddaceb7ed..a49cd01522 100644 --- a/go/pserver/client/c/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package main /* @@ -34,7 +48,6 @@ import ( log "github.com/sirupsen/logrus" ) -var nullPtr = unsafe.Pointer(uintptr(0)) var mu sync.Mutex var handleMap = make(map[C.paddle_pserver_client]*client.Client) var curHandle C.paddle_pserver_client @@ -42,10 +55,10 @@ var curHandle C.paddle_pserver_client func add(c *client.Client) C.paddle_pserver_client { mu.Lock() defer mu.Unlock() - client := curHandle + cli := curHandle curHandle++ - handleMap[client] = c - return client + handleMap[cli] = c + return cli } func get(client C.paddle_pserver_client) *client.Client { @@ -63,7 +76,7 @@ func remove(client C.paddle_pserver_client) *client.Client { } func cArrayToSlice(p unsafe.Pointer, len int) []byte { - if p == nullPtr { + if p == nil { return nil } @@ -77,8 +90,12 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte { type selector bool -func (s selector) Select() bool { - return bool(s) +func (s selector) Select() (bool, error) { + return bool(s), nil +} + +func (s selector) Done() error { + return nil } type lister []client.Server @@ -101,11 +118,10 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli } //export paddle_new_etcd_pserver_client -func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client { - // TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters) - addr := C.GoString(etcd_endpoints) - etcd_client := client.NewEtcd(addr) - c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0)) +func paddle_new_etcd_pserver_client(etcdEndpoints *C.char) C.paddle_pserver_client { + addr := C.GoString(etcdEndpoints) + etcdClient := client.NewEtcd(addr) + c := client.NewClient(etcdClient, etcdClient.Desired(), etcdClient) return add(c) } @@ -114,30 +130,41 @@ func paddle_pserver_client_release(client C.paddle_pserver_client) { remove(client) } +// paddle_begin_init_params tells trainer if it needs to init the +// parameters. +// +// returns 1 if the trainer needs to init the parameters. 0 if the +// trainer does not need to init the parameters. 
+// //export paddle_begin_init_params func paddle_begin_init_params(client C.paddle_pserver_client) C.int { c := get(client) - if selected := c.BeginInitParams(); selected { + selected, err := c.BeginInitParams() + if err != nil { + panic(err) + } + + if selected { return 1 } - return C.PSERVER_OK + return 0 } //export paddle_init_param -func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int { +func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int { et := pserver.ElementType(param.element_type) name := C.GoString(param.name) content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len)) pc := pserver.ParameterWithConfig{ Param: pserver.Parameter{Name: name, ElementType: et, Content: content}, - Config: cArrayToSlice(param_config, int(config_len)), + Config: cArrayToSlice(paramConfig, int(configLen)), } c := get(client) err := c.InitParam(pc) if err != nil { if err.Error() == pserver.AlreadyInitialized { - log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name) + log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name) return C.PSERVER_OK } log.Errorln(err) @@ -153,7 +180,7 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int { err := c.FinishInitParams() if err != nil { if err.Error() == pserver.AlreadyInitialized { - log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.") + log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.") return C.PSERVER_OK } @@ -223,12 +250,12 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, p := ps[i] param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst)))) - if unsafe.Pointer(param) == nullPtr { + if unsafe.Pointer(param) == nil { log.Errorln("must pre-allocate parameter.") return C.PSERVER_ERROR } - if unsafe.Pointer(param.content) != nullPtr { + if unsafe.Pointer(param.content) != nil { if int(param.content_len) != len(p.Content) { log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content)) return C.PSERVER_ERROR @@ -243,17 +270,4 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, return C.PSERVER_OK } -//export paddle_save_model -func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int { - p := C.GoString(path) - c := get(client) - err := c.Save(p) - if err != nil { - log.Errorln(err) - return C.PSERVER_ERROR - } - - return C.PSERVER_OK -} - func main() {} // Required but ignored diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt index dce8645ce7..3724ccb60b 100644 --- a/go/pserver/client/c/test/CMakeLists.txt +++ b/go/pserver/client/c/test/CMakeLists.txt @@ -1,2 +1,16 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer) add_style_check_target(test_cclient test_cclient.c) diff --git a/go/pserver/client/c/test/test_cclient.c b/go/pserver/client/c/test/test_cclient.c index 8eababbe33..89c4d7f00a 100644 --- a/go/pserver/client/c/test/test_cclient.c +++ b/go/pserver/client/c/test/test_cclient.c @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #include #include @@ -97,9 +111,5 @@ retry: getParams(c); } - if (paddle_save_model(c, "/tmp/")) { - fail(); - } - return 0; } diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py index d6922672f4..8d9c6b9b20 100644 --- a/go/pserver/client/c/test/test_train.py +++ b/go/pserver/client/c/test/test_train.py @@ -1,5 +1,13 @@ import paddle.v2 as paddle import paddle.v2.dataset.uci_housing as uci_housing +import paddle.v2.master as master +import os +import cPickle as pickle +from paddle.v2.reader.creator import cloud_reader + +etcd_ip = os.getenv("MASTER_IP", "127.0.0.1") +etcd_endpoints = "http://" + etcd_ip + ":2379" +print "etcd endpoints: ", etcd_endpoints def main(): @@ -20,19 +28,20 @@ def main(): parameters = paddle.parameters.create(cost) # create optimizer of new remote updater to pserver - optimizer = paddle.optimizer.Momentum(momentum=0) - - #TODO(zhihong) : replace optimizer with new OptimizerConfig + optimizer = paddle.optimizer.Momentum(momentum=0, learning_rate=1e-3) trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, update_equation=optimizer, is_local=False, - pserver_spec="localhost:3000") + pserver_spec=etcd_endpoints, + use_etcd=True) # event_handler to print training and testing info def event_handler(event): if isinstance(event, paddle.event.EndIteration): + # FIXME: for cloud data reader, pass number is managed by master + # should print the server side pass number if event.batch_id % 100 == 0: print "Pass %d, Batch %d, Cost %f" % ( event.pass_id, event.batch_id, event.cost) @@ -47,10 +56,14 @@ def main(): print "Test %d, %.2f" % (event.pass_id, result.cost) # training + # NOTE: use uci_housing.train() as reader for non-paddlecloud training trainer.train( reader=paddle.batch( paddle.reader.shuffle( - uci_housing.train(), buf_size=500), + cloud_reader( + ["/pfs/dlnel/public/dataset/uci_housing/uci_housing*"], + etcd_endpoints), + buf_size=500), batch_size=2), feeding={'x': 0, 'y': 1}, diff --git a/go/pserver/client/client.go b/go/pserver/client/client.go index aa8bfe30c2..20d91e7703 100644 --- a/go/pserver/client/client.go +++ 
b/go/pserver/client/client.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package client import ( @@ -13,9 +27,13 @@ import ( // TODO(helin): add RPC call retry logic -// Selector selects if the client should initialize parameter servers. +// Selector selects if the client should initialize parameters and +// reports the initialization process done. type Selector interface { - Select() bool + // Select selects if the client should initialize parameter servers. + Select() (bool, error) + // Done indicates the initialization process is done. + Done() error } // Server is the identification of a parameter Server. @@ -101,7 +119,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) { // servers. Other trainers will be blocked until the initialization is // done, and they need to get the initialized parameters from // parameter servers using GetParams. -func (c *Client) BeginInitParams() bool { +func (c *Client) BeginInitParams() (bool, error) { return c.sel.Select() } @@ -205,35 +223,9 @@ func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) { return ps, nil } -// Save indicates parameters to save the parameter to the given path. -func (c *Client) Save(path string) error { - errCh := make(chan error, len(c.pservers)) - - for _, p := range c.pservers { - err := p.Call("Service.Save", path, nil) - errCh <- err - } - - recv := 0 - for err := range errCh { - if err != nil { - return err - } - - recv++ - if recv == len(c.pservers) { - break - } - } - - // TODO(helin): there will be many files under path, need to - // merge them into a single file. - return nil -} - func strHash(s string) uint32 { h := fnv.New32a() - h.Write([]byte(s)) + _, _ = h.Write([]byte(s)) return h.Sum32() } diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index 2b72a202b5..c3d88e926d 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -1,13 +1,29 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package client_test import ( "context" "io/ioutil" + "math/rand" "net" "net/http" "net/rpc" "strconv" "strings" + "sync" "testing" "time" @@ -43,7 +59,7 @@ func initClient() [numPserver]int { go func(l net.Listener) { var cp pserver.Checkpoint - s, err := pserver.NewService(0, 1, "", nil, cp) + s, err := pserver.NewService(0, time.Hour, "", nil, cp) if err != nil { panic(err) } @@ -77,21 +93,43 @@ func initEtcdClient() { log.Errorf("err %v", err) } ctx, cancel := context.WithTimeout(context.Background(), timeout) - client.Delete(ctx, pserver.PsDesired) - client.Delete(ctx, pserver.PsPath) - client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver)) + _, err = client.Delete(ctx, pserver.PsDesired) + if err != nil { + panic(err) + } + + _, err = client.Delete(ctx, pserver.PsPath) + if err != nil { + panic(err) + } + + _, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver)) + if err != nil { + panic(err) + } + ports := initClient() for i := 0; i < numPserver; i++ { - client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i])) + _, err = client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i])) + if err != nil { + panic(err) + } } cancel() - client.Close() + err = client.Close() + if err != nil { + panic(err) + } } type selector bool -func (s selector) Select() bool { - return bool(s) +func (s selector) Select() (bool, error) { + return bool(s), nil +} + +func (s selector) Done() error { + return nil } type lister []client.Server @@ -100,27 +138,38 @@ func (l lister) List() []client.Server { return l } -func ClientTest(t *testing.T, c *client.Client) { - selected := c.BeginInitParams() +func testClient(t *testing.T, c *client.Client) { + selected, err := c.BeginInitParams() + if err != nil { + t.Fatal(err) + } + if !selected { t.Fatal("should be selected.") } - const numParameter = 100 + const numParameter = 1000 config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb") if err != nil { t.Fatalf("read optimizer proto failed") } + + var wg sync.WaitGroup for i := 0; i < numParameter; i++ { - var p pserver.Parameter - p.Name = "p_" + strconv.Itoa(i) - p.ElementType = pserver.Float32 - p.Content = make([]byte, (i+1)*100) - err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}) - if err != nil { - t.Fatal(err) - } + wg.Add(1) + go func(i int) { + var p pserver.Parameter + p.Name = "p_" + strconv.Itoa(i) + p.ElementType = pserver.Float32 + p.Content = make([]byte, (i+1)*100) + err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}) + if err != nil { + t.Fatal(err) + } + wg.Done() + }(i) } + wg.Wait() err = c.FinishInitParams() if err != nil { @@ -128,7 +177,7 @@ func ClientTest(t *testing.T, c *client.Client) { } var grads []pserver.Gradient - for i := 0; i < numParameter/2; i++ { + for i := 0; i < numParameter; i++ { var g pserver.Gradient g.Name = "p_" + strconv.Itoa(i) g.ElementType = pserver.Float32 @@ -136,9 +185,31 @@ func ClientTest(t *testing.T, c *client.Client) { grads = append(grads, g) } - err = c.SendGrads(grads) - if err != nil { - t.Fatal(err) + const paramPerGroup = 10 + const numGroups = numParameter / paramPerGroup + + // shuffle send grads order + for i := range grads { + j := rand.Intn(i + 1) + grads[i], grads[j] = grads[j], grads[i] + } + + for i := 0; i < numGroups; i++ { + var gs []pserver.Gradient + if i == numGroups-1 { + gs = grads[i*paramPerGroup:] + } else { + gs = grads[i*paramPerGroup : (i+1)*paramPerGroup] + } + + wg.Add(1) + go func(gs []pserver.Gradient) { + err := 
c.SendGrads(gs) + if err != nil { + t.Fatal(err) + } + wg.Done() + }(gs) } names := make([]string, numParameter) @@ -146,20 +217,35 @@ func ClientTest(t *testing.T, c *client.Client) { names[i] = "p_" + strconv.Itoa(i) } - params, err := c.GetParams(names) - if err != nil { - t.Fatal(err) - } + for i := 0; i < numGroups; i++ { + var ns []string + if i == numGroups-1 { + ns = names[i*paramPerGroup:] + } else { + ns = names[i*paramPerGroup : (i+1)*paramPerGroup] + } - if len(names) != len(params) { - t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params)) - } + wg.Add(1) + go func(ns []string) { + params, err := c.GetParams(ns) + if err != nil { + t.Fatal(err) + } - for i := range params { - if names[i] != params[i].Name { - t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name) - } + if len(ns) != len(params) { + t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params)) + } + + for i := range params { + if ns[i] != params[i].Name { + t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", ns[i], params[i].Name) + } + } + wg.Done() + }(ns) } + + wg.Wait() } func TestNativeClient(t *testing.T) { @@ -169,13 +255,14 @@ func TestNativeClient(t *testing.T) { servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])} } c1 := client.NewClient(lister(servers), len(servers), selector(true)) - ClientTest(t, c1) + testClient(t, c1) } -// TODO: tmperary disable etcdClient test for dependency of etcd) +// EtcdClient is a disabled test, since we have not embedded etcd into +// our test. func EtcdClient(t *testing.T) { initEtcdClient() etcdClient := client.NewEtcd(etcdEndpoints) c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true)) - ClientTest(t, c2) + testClient(t, c2) } diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go index 1fd3479aa8..f9071caaa8 100644 --- a/go/pserver/client/etcd_client.go +++ b/go/pserver/client/etcd_client.go @@ -1,54 +1,75 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package client import ( "context" + "errors" + "fmt" "strconv" "strings" "time" "github.com/PaddlePaddle/Paddle/go/pserver" "github.com/coreos/etcd/clientv3" + "github.com/coreos/etcd/clientv3/concurrency" log "github.com/sirupsen/logrus" ) const ( - DefaultEtcdTimeout time.Duration = 5 * time.Second + defaultEtcdTimeout time.Duration = 5 * time.Second + + initLockPath = "/init_ps/lock" + initDonePath = "/init_ps/done" + initDoneVal = "1" ) -// EtcdClient is used by pserver client that is a part of trainer process. +// Etcd is used by pserver client that is a part of trainer process. // TODO: -// 1. add watcher to watch the change state of pservers) -// 1. add etcd lock) -type EtcdClient struct { +// 1. add watcher to watch the change state of pservers. 
+type Etcd struct { client *clientv3.Client timeout time.Duration endpoints []string + lock *concurrency.Mutex } // Desired read ps desired number from etcd. -func (p *EtcdClient) Desired() int { +func (e *Etcd) Desired() int { var psDesired int for { - ctx, cancel := context.WithTimeout(context.Background(), p.timeout) - resp, err := p.client.Get(ctx, pserver.PsDesired) + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) + resp, err := e.client.Get(ctx, pserver.PsDesired) cancel() if err != nil { log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err) - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } kvs := resp.Kvs if len(kvs) == 0 { log.Infoln("Waiting for ps desired registered ...") - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { - log.Errorf("psDesired %s invalid %v", psDesired, err) - time.Sleep(p.timeout) + log.Errorf("psDesired %d invalid %v", psDesired, err) + time.Sleep(e.timeout) continue } @@ -59,26 +80,26 @@ func (p *EtcdClient) Desired() int { } // List return the pserver list read from etcd. -func (p *EtcdClient) List() []Server { - psDesired := p.Desired() +func (e *Etcd) List() []Server { + psDesired := e.Desired() servers := make([]Server, psDesired) for { for i := 0; i < psDesired; i++ { - ctx, cancel := context.WithTimeout(context.Background(), p.timeout) - cancel() + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) psKey := pserver.PsPath + strconv.Itoa(i) log.Debugf("checking %s", psKey) - resp, err := p.client.Get(ctx, psKey) + resp, err := e.client.Get(ctx, psKey) + cancel() if err != nil { log.Infof("Get psKey= %s error, %v", psKey, err) - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } kvs := resp.Kvs if len(kvs) == 0 { log.Infof("Waiting for ps addr registered ...") - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } @@ -86,10 +107,10 @@ func (p *EtcdClient) List() []Server { // TODO(Longfei) check the ps address if psAddr == "" { log.Infof("Get psKey = %s, psAddr is empty", psKey) - time.Sleep(p.timeout) + time.Sleep(e.timeout) continue } - log.Infof("got value (%s) for key: %s", psAddr, psKey) + log.Debugf("got value (%s) for key: %s", psAddr, psKey) servers[i].Index = i servers[i].Addr = psAddr } @@ -99,27 +120,135 @@ func (p *EtcdClient) List() []Server { } // NewEtcd create a etcd client to return the state of pserver on etcd. -func NewEtcd(endpoints string) *EtcdClient { +func NewEtcd(endpoints string) *Etcd { ep := strings.Split(endpoints, ",") var cli *clientv3.Client var err error for { cli, err = clientv3.New(clientv3.Config{ Endpoints: ep, - DialTimeout: DefaultEtcdTimeout, + DialTimeout: defaultEtcdTimeout, }) if err != nil { log.Errorf("Init etcd connection failed: %v", err) - time.Sleep(DefaultEtcdTimeout) + time.Sleep(defaultEtcdTimeout) continue } break } log.Infof("Connected to etcd: %s\n", endpoints) - client := &EtcdClient{ + client := &Etcd{ client: cli, - timeout: DefaultEtcdTimeout, + timeout: defaultEtcdTimeout, endpoints: ep, } return client } + +// Select indicates if the current trainer is selected to initialize +// the pserver parameters. 
+func (e *Etcd) Select() (bool, error) { + sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5)) + if err != nil { + return false, err + } + + lock := concurrency.NewMutex(sess, initLockPath) + log.Infof("Trying to acquire lock at %s.", initLockPath) + // Do not use timeout context here, since we don't know how + // long does it take for other trainers to initialize the + // parameters. + err = lock.Lock(context.Background()) + if err != nil { + return false, err + } + log.Infof("Successfully acquired lock at %s.", initLockPath) + + get := clientv3.OpGet(initDonePath) + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) + tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit() + cancel() + if err != nil { + return false, err + } + + if !tresp.Succeeded { + return false, errors.New("no longer the owner of the lock") + } + + resp := tresp.Responses[0].GetResponseRange() + + if len(resp.Kvs) == 0 { + // Key value not set, select current trainer. + e.lock = lock + log.Infoln("Trainer selected.") + return true, nil + } + + if string(resp.Kvs[0].Value) == initDoneVal { + log.Infoln("Initialization is already done.") + ctx, cancel = context.WithTimeout(context.Background(), e.timeout) + err = lock.Unlock(ctx) + cancel() + if err != nil { + log.Errorln(err) + } + return false, nil + } + + return false, fmt.Errorf("key %s have unexpected value: %v", initDonePath, resp.Kvs[0].Value) +} + +// Done indicates the parameter initialization process is done. +func (e *Etcd) Done() error { + if e.lock == nil { + return errors.New("lock is nil, Done called unexpectedly") + } + + put := clientv3.OpPut(initDonePath, initDoneVal) + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) + tresp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit() + cancel() + if err != nil { + return err + } + + if !tresp.Succeeded { + return errors.New("no longer the owner of the lock") + } + + ctx, cancel = context.WithTimeout(context.Background(), e.timeout) + err = e.lock.Unlock(ctx) + cancel() + if err != nil { + log.Errorln(err) + } else { + e.lock = nil + } + + return nil +} + +// Close closes the etcd client. 
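Reviewer note (illustrative, not part of the patch): after this change the pserver client's Etcd type doubles as the Selector, taking a distributed lock under /init_ps/lock and recording completion at /init_ps/done, so only one trainer initializes the parameters. The sketch below shows the calling sequence as the reviewer reads it; whether Done is invoked by the trainer directly or folded into FinishInitParams is not visible in this hunk, so the explicit Done call and the params argument are assumptions.

package main

import (
	"github.com/PaddlePaddle/Paddle/go/pserver"
	"github.com/PaddlePaddle/Paddle/go/pserver/client"
)

func initParameters(etcdEndpoints string, params []pserver.ParameterWithConfig) error {
	// The etcd helper is the lister, the desired-count source and,
	// after this patch, the Selector all at once.
	e := client.NewEtcd(etcdEndpoints)
	c := client.NewClient(e, e.Desired(), e)

	selected, err := c.BeginInitParams()
	if err != nil {
		return err
	}

	if selected {
		// This trainer won the init lock: push the initial parameters.
		for _, p := range params {
			if err := c.InitParam(p); err != nil {
				return err
			}
		}
		if err := c.FinishInitParams(); err != nil {
			return err
		}
		// Mark /init_ps/done so late-starting trainers skip init.
		if err := e.Done(); err != nil {
			return err
		}
	}
	return nil
}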
+func (e *Etcd) Close() error { + var err error + if e.lock != nil { + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) + err = e.lock.Unlock(ctx) + cancel() + if err == nil { + e.lock = nil + } + } + + cErr := e.client.Close() + if cErr != nil { + if err != nil { + log.Errorln(cErr) + return err + } + return cErr + } + + return err +} diff --git a/go/pserver/client/etcd_client_test.go b/go/pserver/client/etcd_client_test.go new file mode 100644 index 0000000000..08742433e7 --- /dev/null +++ b/go/pserver/client/etcd_client_test.go @@ -0,0 +1,106 @@ +package client_test + +import ( + "io/ioutil" + "net/url" + "os" + "strings" + "sync" + "testing" + + "github.com/PaddlePaddle/Paddle/go/pserver/client" + "github.com/coreos/etcd/embed" +) + +func TestSelector(t *testing.T) { + etcdDir, err := ioutil.TempDir("", "") + if err != nil { + t.Fatal(err) + } + cfg := embed.NewConfig() + lpurl, _ := url.Parse("http://localhost:0") + lcurl, _ := url.Parse("http://localhost:0") + cfg.LPUrls = []url.URL{*lpurl} + cfg.LCUrls = []url.URL{*lcurl} + cfg.Dir = etcdDir + e, err := embed.StartEtcd(cfg) + if err != nil { + t.Fatal(err) + } + + defer func() { + e.Close() + if err := os.RemoveAll(etcdDir); err != nil { + t.Fatal(err) + } + }() + + <-e.Server.ReadyNotify() + + port := strings.Split(e.Clients[0].Addr().String(), ":")[1] + endpoint := "127.0.0.1:" + port + + var mu sync.Mutex + selectedCount := 0 + var wg sync.WaitGroup + selectAndDone := func(c *client.Etcd) { + defer wg.Done() + + selected, err := c.Select() + if err != nil { + panic(err) + } + + if selected { + mu.Lock() + selectedCount++ + mu.Unlock() + err = c.Done() + if err != nil { + t.Fatal(err) + } + } + } + + c0 := client.NewEtcd(endpoint) + c1 := client.NewEtcd(endpoint) + c2 := client.NewEtcd(endpoint) + c3 := client.NewEtcd(endpoint) + wg.Add(3) + go selectAndDone(c0) + go selectAndDone(c1) + go selectAndDone(c2) + wg.Wait() + + // simulate trainer crashed and restarted after the + // initialization process. + wg.Add(1) + go selectAndDone(c3) + wg.Wait() + + mu.Lock() + if selectedCount != 1 { + t.Fatal("selected count wrong:", selectedCount) + } + mu.Unlock() + + err = c0.Close() + if err != nil { + t.Fatal(err) + } + + err = c1.Close() + if err != nil { + t.Fatal(err) + } + + err = c2.Close() + if err != nil { + t.Fatal(err) + } + + err = c3.Close() + if err != nil { + t.Fatal(err) + } +} diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 4a694b97f4..41f0640fc0 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package pserver import ( @@ -20,16 +34,19 @@ const ( PsPath = "/ps/" // PsCheckpoint is the etcd path for store checkpoints information PsCheckpoint = "/checkpoints/" + + retryTimeout = 5 * time.Second ) // EtcdClient is the etcd client that the pserver uses for fault // tolerance, service registry and coordination. 
type EtcdClient struct { - numPservers int - etcdEndpoints string - etcdClient *clientv3.Client - // etcdTimeout is also used as retry intervals. - etcdTimeout time.Duration + numPservers int + endpoints string + client *clientv3.Client + sess *concurrency.Session + dialTimeout time.Duration + ttlSec int // FIXME: ensure GetExternalIP gets the correct ip for trainers to connect. externalIP string // desired number of pservers in the job. @@ -38,19 +55,19 @@ type EtcdClient struct { } // NewEtcdClient creates an EtcdClient -func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient { +func NewEtcdClient(endpoints string, numPservers int, dialtimeout time.Duration, ttlSec int) *EtcdClient { return &EtcdClient{ - etcdTimeout: timeout, - numPservers: numPservers, - etcdEndpoints: endpoints, + dialTimeout: dialtimeout, + ttlSec: ttlSec, + numPservers: numPservers, + endpoints: endpoints, } } // Register registers the pserver on etcd // // Register returns the index of the current pserver. -func (e *EtcdClient) Register() (int, error) { - +func (e *EtcdClient) Register(port int) (int, error) { var err error e.externalIP, err = networkhelper.GetExternalIP() if err != nil { @@ -58,19 +75,26 @@ func (e *EtcdClient) Register() (int, error) { } // initialize connection to etcd. - ep := strings.Split(e.etcdEndpoints, ",") + ep := strings.Split(e.endpoints, ",") for { cli, err := clientv3.New(clientv3.Config{ Endpoints: ep, - DialTimeout: e.etcdTimeout, + DialTimeout: e.dialTimeout, }) if err != nil { log.Errorf("connect to etcd error: %v", err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } - e.etcdClient = cli - log.Debugf("inited client to %s", e.etcdEndpoints) + e.client = cli + sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec)) + if err != nil { + log.Errorf("create etcd session error: %v", err) + time.Sleep(retryTimeout) + continue + } + e.sess = sess + log.Debugf("inited client to %s", e.endpoints) break } // init /ps_desired using transaction, for multiple pservers may want to write @@ -81,7 +105,7 @@ func (e *EtcdClient) Register() (int, error) { cancel() if err != nil { log.Warn(err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } break @@ -92,18 +116,18 @@ func (e *EtcdClient) Register() (int, error) { // wait and set s.desired init value for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) - resp, err := e.etcdClient.Get(ctx, PsDesired) + resp, err := e.client.Get(ctx, PsDesired) cancel() if err != nil { log.Errorf("getting %s error: %v", PsDesired, err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } if len(resp.Kvs) != 0 { e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { log.Errorf("value of %s invalid %v\n", PsDesired, err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) // NOTE: wait util ps_desired value change continue } @@ -116,11 +140,11 @@ func (e *EtcdClient) Register() (int, error) { for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) var err error - pserverIdx, err = e.registerPserverEtcd(ctx) + pserverIdx, err = e.registerPserverEtcd(ctx, port) cancel() if err != nil { log.Warn(err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } break @@ -130,19 +154,19 @@ func (e *EtcdClient) Register() (int, error) { } func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) { - return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) 
error { + return concurrency.NewSTM(e.client, func(c concurrency.STM) error { dsStr := c.Get(PsDesired) if dsStr == "" { - c.Put(PsDesired, strconv.Itoa(numPservers)) + c.Put(PsDesired, strconv.Itoa(numPservers), clientv3.WithLease(e.sess.Lease())) } return nil }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) } // registerPserverEtcd registers pserver node on etcd using transaction. -func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { +func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) { var idx int - _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + _, err := concurrency.NewSTM(e.client, func(c concurrency.STM) error { registered := false for i := 0; i < e.desired; i++ { psKey := PsPath + strconv.Itoa(i) @@ -151,35 +175,20 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { log.Debugf("got value (%s) for key: %s", ps, psKey) if ps == "" { - resp, err := e.etcdClient.Grant(context.TODO(), 5) - if err != nil { - log.Fatal(err) - } // find the first id and write info - c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID)) - log.Debugf("set pserver node %s with value %s", psKey, e.externalIP) - ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID) - if kaerr != nil { - log.Errorf("keepalive etcd node error: %v", kaerr) - return kaerr - } - - // Eat the keep alive message so etcd - // will not expire the lease. - go func(ch <-chan *clientv3.LeaseKeepAliveResponse) { - ka := <-ch - log.Debugf("keepalive: %d\n", ka.TTL) - }(ch) + pserverAddr := e.externalIP + ":" + strconv.Itoa(port) + c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease())) + log.Debugf("set pserver node %s with value %s", psKey, pserverAddr) log.Debug("register finished") idx = i registered = true break } } - if registered == true { + if registered { return nil } - return errors.New("not registerd, may due to already have enough pservers") + return errors.New("not registered, may due to already have enough pservers") }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) if err != nil { @@ -192,11 +201,12 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { // GetKey gets the value by the specified key func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) - resp, err := e.etcdClient.Get(ctx, key) + resp, err := e.client.Get(ctx, key) cancel() if err != nil { return []byte{}, err } + kvs := resp.Kvs if len(kvs) == 0 { return []byte{}, nil @@ -206,12 +216,34 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { } // PutKey put into etcd with value by key specified -func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error { +func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error { ctx, cancel := context.WithTimeout(context.Background(), timeout) - _, err := e.etcdClient.Put(ctx, key, string(value)) + var err error + if withLease { + _, err = e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease())) + } else { + _, err = e.client.Put(ctx, key, string(value)) + } cancel() - if err != nil { - return err + return err +} + +// Shutdown shuts down the etcd client gracefully. 
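Reviewer note (illustrative, not part of the patch): pserver registration now rides on a single concurrency.Session whose lease backs both the /ps_desired value and the /ps/<idx> address, replacing the hand-rolled Grant/KeepAlive loop, and the registered value now includes the service port. A sketch of the resulting lifecycle follows; the dial timeout and TTL values are placeholders.

package main

import (
	"time"

	"github.com/PaddlePaddle/Paddle/go/pserver"
)

func runPserver(etcdEndpoints string, numPservers, port int) error {
	// Dial timeout and session TTL are illustrative values only.
	e := pserver.NewEtcdClient(etcdEndpoints, numPservers, 5*time.Second, 10)

	// Register blocks until this pserver owns one of the /ps/<idx>
	// slots; the slot is kept alive by the session lease, so it
	// disappears automatically if the process dies.
	idx, err := e.Register(port)
	if err != nil {
		return err
	}
	_ = idx // the index selects which checkpoint shard to load

	// ... serve RPC requests ...

	// Shutdown closes the session (revoking the lease) and then the
	// underlying etcd client.
	return e.Shutdown()
}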
+func (e *EtcdClient) Shutdown() error { + var err error + if e.sess != nil { + err = e.sess.Close() + } + + if e.client != nil { + newErr := e.client.Close() + if newErr != nil { + if err != nil { + log.Errorln(newErr) + } else { + err = newErr + } + } } - return nil + return err } diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index a6b73dd5a1..ae73590734 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package pserver // #cgo CFLAGS: -I ../../ @@ -14,15 +28,15 @@ import ( log "github.com/sirupsen/logrus" ) -var nullPtr = unsafe.Pointer(uintptr(0)) - type optimizer struct { opt *C.struct_paddle_optimizer elementType ElementType + contentLen int + config []byte } func cArrayToSlice(p unsafe.Pointer, len int) []byte { - if p == nullPtr { + if p == nil { return nil } @@ -37,10 +51,11 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte { func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer { o := &optimizer{} o.elementType = paramWithConfigs.Param.ElementType + o.contentLen = len(paramWithConfigs.Param.Content) p := paramWithConfigs.Param c := paramWithConfigs.Config s := State - paramBufferSize := C.size_t(len(p.Content) / C.sizeof_float) + paramBufferSize := C.size_t(len(p.Content)) log.WithFields(log.Fields{ "ElementType": p.ElementType, "ParamSize": paramBufferSize, @@ -56,6 +71,7 @@ func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer cstate = unsafe.Pointer(&s[0]) } + o.config = c o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)), C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s))) return o @@ -78,7 +94,11 @@ func (o *optimizer) UpdateParameter(g Gradient) error { return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType) } - r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))/C.sizeof_float) + if o.contentLen != len(g.Content) { + return fmt.Errorf("Name: %s, parameter and gradient does not have same content len, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content)) + } + + r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))) if r != 0 { return fmt.Errorf("optimizer update returned error code: %d", r) } @@ -86,8 +106,8 @@ func (o *optimizer) UpdateParameter(g Gradient) error { } func (o *optimizer) Cleanup() { - if unsafe.Pointer(o.opt) != nullPtr { + if unsafe.Pointer(o.opt) != nil { C.paddle_release_optimizer(o.opt) - o.opt = (*C.struct_paddle_optimizer)(nullPtr) + o.opt = (*C.struct_paddle_optimizer)(nil) } } diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go index d19e9de92e..d001e6993e 100644 --- 
a/go/pserver/optimizer_test.go +++ b/go/pserver/optimizer_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package pserver import ( diff --git a/go/pserver/service.go b/go/pserver/service.go index fec2ec61dc..25751540a9 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package pserver import ( @@ -11,22 +25,28 @@ import ( "fmt" "io/ioutil" "os" - "path/filepath" + "path" "strconv" "sync" "time" + uuid "github.com/satori/go.uuid" + log "github.com/sirupsen/logrus" ) // ElementType is the type of elements of a Parameter. type ElementType int +// ErrCheckpointNotFound indicates that the pserver checkpoint could +// not be found. +var ErrCheckpointNotFound = errors.New("checkpoint not found") + // RPC error message. const ( - AlreadyInitialized = "pserver already initialized" - Uninitialized = "pserver not fully initialized" - CheckpointMD5Failed = "checkpoint file MD5 validation failed" + AlreadyInitialized = "pserver already initialized" + Uninitialized = "pserver not fully initialized" + WrongChecksum = "checkpoint file checksum validation failed" ) // Supported element types. @@ -55,11 +75,12 @@ type ParameterWithConfig struct { // checkpointMeta saves checkpoint metadata type checkpointMeta struct { UUID string `json:"uuid"` + Path string `json:"path"` MD5 string `json:"md5"` Timestamp int64 `json:"timestamp"` } -// Checkpoint is the pserver shard persist in file +// Checkpoint is the pserver shard persist in file. type Checkpoint []parameterCheckpoint // Gradient is the gradient of the parameter. @@ -72,46 +93,58 @@ type Service struct { checkpointInterval time.Duration checkpointPath string client *EtcdClient - mu sync.Mutex - optMap map[string]*optimizer + + mu sync.Mutex + optMap map[string]*optimizer } -// parameterCheckpoint saves parameter checkpoint +// parameterCheckpoint saves parameter checkpoint. 
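Reviewer note (illustrative, not part of the patch): the checkpoint metadata written under /checkpoints/<idx> now records the on-disk Path of the snapshot in addition to its UUID, MD5 and timestamp. The helper below is a hypothetical inspection tool that reads that metadata back through the exported GetKey; the cpMeta mirror struct exists only because checkpointMeta itself is unexported.

package main

import (
	"encoding/json"
	"fmt"
	"strconv"
	"time"

	"github.com/PaddlePaddle/Paddle/go/pserver"
)

// cpMeta mirrors the unexported checkpointMeta layout for inspection
// purposes only; the JSON tags match what the pserver writes.
type cpMeta struct {
	UUID      string `json:"uuid"`
	Path      string `json:"path"`
	MD5       string `json:"md5"`
	Timestamp int64  `json:"timestamp"`
}

// showCheckpointMeta fetches /checkpoints/<idx> and reports where the
// latest checkpoint file of that shard lives.
func showCheckpointMeta(e *pserver.EtcdClient, idx int) error {
	v, err := e.GetKey(pserver.PsCheckpoint+strconv.Itoa(idx), 3*time.Second)
	if err != nil {
		return err
	}
	if len(v) == 0 {
		fmt.Printf("shard %d has no checkpoint yet\n", idx)
		return nil
	}

	var m cpMeta
	if err := json.Unmarshal(v, &m); err != nil {
		return err
	}
	fmt.Printf("shard %d checkpoint %s at %s (md5 %s)\n", idx, m.UUID, m.Path, m.MD5)
	return nil
}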
type parameterCheckpoint struct { ParameterWithConfig State []byte } -// NewCheckpointFromFile loads parameters and state from checkpoint file -func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, error) { - v, err := e.GetKey(PsPath+string(idx), 3*time.Second) +func loadMeta(e *EtcdClient, idx int) (meta checkpointMeta, err error) { + v, err := e.GetKey(PsCheckpoint+strconv.Itoa(idx), 3*time.Second) if err != nil { - return nil, err + return } - var cpMeta checkpointMeta - if err = json.Unmarshal(v, &cpMeta); err != nil { - return nil, err + if len(v) == 0 { + err = ErrCheckpointNotFound + return + } + + if err = json.Unmarshal(v, &meta); err != nil { + return } - fn := filepath.Join(cpPath, cpMeta.UUID) - if _, err = os.Stat(fn); os.IsNotExist(err) { + return +} + +// LoadCheckpoint loads checkpoint from file. +func LoadCheckpoint(e *EtcdClient, idx int) (Checkpoint, error) { + cpMeta, err := loadMeta(e, idx) + if err != nil { return nil, err } - content, err := ioutil.ReadFile(fn) + + content, err := ioutil.ReadFile(cpMeta.Path) if err != nil { return nil, err } + // TODO(helin): change MD5 to CRC since CRC is better for file + // checksum in our use case (emphasize speed over security). h := md5.New() md5 := hex.EncodeToString(h.Sum(content)) if md5 != cpMeta.MD5 { - return nil, errors.New(CheckpointMD5Failed) + return nil, errors.New(WrongChecksum) } dec := gob.NewDecoder(bytes.NewReader(content)) - cp := Checkpoint{} - if err = dec.Decode(cp); err != nil { + var cp Checkpoint + if err = dec.Decode(&cp); err != nil { return nil, err } return cp, nil @@ -142,7 +175,7 @@ func NewService(idx int, interval time.Duration, path string, client *EtcdClient } // InitParam initializes a parameter. -func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error { +func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error { select { case <-s.initialized: return errors.New(AlreadyInitialized) @@ -163,7 +196,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er // FinishInitParams tells the parameter server that the parameter // initialization has finished. -func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error { +func (s *Service) FinishInitParams(_ int, _ *int) error { select { case <-s.initialized: return errors.New(AlreadyInitialized) @@ -171,12 +204,21 @@ func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error { } close(s.initialized) + go func() { + t := time.Tick(s.checkpointInterval) + for range t { + err := s.checkpoint() + if err != nil { + log.Errorln(err) + } + } + }() return nil } // SendGrad sends gradient to parameter servers for parameter // optimization. -func (s *Service) SendGrad(g Gradient, dummy *int) error { +func (s *Service) SendGrad(g Gradient, _ *int) error { select { case <-s.initialized: default: @@ -211,68 +253,117 @@ func (s *Service) GetParam(name string, parameter *Parameter) error { // learning optimization methods are stochastic in // nature. This race condition is allowed deliberately // to save the program from making a copy of the - // paramter content. + // parameter content. 
parameter.Name = name parameter.ElementType = opt.elementType parameter.Content = opt.GetWeights() return nil } -// pserver save checkpoint -func (s *Service) doCheckpoint() error { - <-s.initialized - s.mu.Lock() - defer s.mu.Unlock() +func traceTime(start time.Time, name string) { + elapsed := time.Since(start) + log.Infof("%s took %v", name, elapsed) +} + +// checkpoint saves checkpoint to disk. +// +// checkpoint should be only called after the parameters are +// initialized. +func (s *Service) checkpoint() (err error) { + log.Infoln("Begin save checkpoint.") + defer traceTime(time.Now(), "save checkpoint") + s.mu.Lock() cp := make([]parameterCheckpoint, len(s.optMap)) index := 0 + // TODO(helin): write checkpoint incrementally to reduce memory + // footprint during checkpoint. for name, opt := range s.optMap { var pc parameterCheckpoint pc.Param.Name = name pc.Param.ElementType = opt.elementType pc.Param.Content = opt.GetWeights() + pc.Config = opt.config pc.State = opt.GetStates() cp[index] = pc index++ } + s.mu.Unlock() + var buf bytes.Buffer encoder := gob.NewEncoder(&buf) - err := encoder.Encode(cp) + err = encoder.Encode(cp) if err != nil { - return err + return } - cpMeta := checkpointMeta{} - cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx) - cpMeta.Timestamp = time.Now().UnixNano() - h := md5.New() - cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes())) - - cpMetajson, _ := json.Marshal(cpMeta) - err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second) + id := uuid.NewV4().String() + p := path.Join(s.checkpointPath, id) + f, err := os.Create(p) if err != nil { - return err + return } - if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) { - log.Info("checkpoint does not exists.") - } else { - err = os.Remove(cpMeta.UUID) - if err != nil { - log.Infof("Removing checkpoint %s failed", cpMeta.UUID) - } else { - log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID) + + defer func() { + closeErr := f.Close() + if closeErr != nil { + if err != nil { + log.Errorln(closeErr) + } else { + // Set closeErr as return value. + err = closeErr + } } + }() + + writer := bufio.NewWriter(f) + _, err = writer.Write(buf.Bytes()) + if err != nil { + return } - f, err := os.Create(cpMeta.UUID) - defer f.Close() + + err = writer.Flush() if err != nil { - return err + return } - writer := bufio.NewWriter(f) - _, err = writer.Write(buf.Bytes()) - writer.Flush() + + oldMeta, err := loadMeta(s.client, s.idx) + if err == ErrCheckpointNotFound { + log.Infoln("Do not have existing checkpoint.") + err = nil + } + if err != nil { - return err + return } - return nil + + h := md5.New() + md5 := hex.EncodeToString(h.Sum(buf.Bytes())) + cpMeta := checkpointMeta{ + UUID: id, + Timestamp: time.Now().UnixNano(), + MD5: md5, + Path: p, + } + + json, err := json.Marshal(cpMeta) + if err != nil { + return + } + + err = s.client.PutKey(PsCheckpoint+strconv.Itoa(s.idx), json, 3*time.Second, false) + if err != nil { + return + } + + if oldMeta.Path != "" { + rmErr := os.Remove(oldMeta.Path) + if rmErr != nil { + // log error, but still treat checkpoint as + // successful. + log.Errorln(rmErr) + } + } + + return } diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index 9bf1a48a59..be648cd1e8 100644 --- a/go/pserver/service_test.go +++ b/go/pserver/service_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package pserver_test import ( @@ -16,7 +30,7 @@ const ( func TestServiceFull(t *testing.T) { var cp pserver.Checkpoint - s, err := pserver.NewService(0, 1, "", nil, cp) + s, err := pserver.NewService(0, time.Hour, "", nil, cp) if err != nil { t.Error(err) } @@ -31,7 +45,7 @@ func TestServiceFull(t *testing.T) { err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil) if err != nil { - t.FailNow() + t.Fatal(err) } var p1 pserver.Parameter @@ -40,40 +54,40 @@ func TestServiceFull(t *testing.T) { p1.ElementType = pserver.Float32 err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err != nil { - t.FailNow() + t.Fatal(err) } var param pserver.Parameter err = s.GetParam("param_b", ¶m) if err != nil { - t.FailNow() + t.Fatal(err) } if !reflect.DeepEqual(param, p1) { - t.FailNow() + t.Fatal("not equal:", param, p1) } g1, g2 := pserver.Gradient(p1), pserver.Gradient(p) err = s.SendGrad(g1, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.SendGrad(g2, nil) if err != nil { - t.FailNow() + t.Fatal(err) } var param1 pserver.Parameter err = s.GetParam("param_a", ¶m1) if err != nil { - t.FailNow() + t.Fatal(err) } // don't compare content, since it's already changed by @@ -82,39 +96,39 @@ func TestServiceFull(t *testing.T) { p.Content = nil if !reflect.DeepEqual(param1, p) { - t.FailNow() + t.Fatal("not equal:", param1, p) } } func TestMultipleInit(t *testing.T) { var cp pserver.Checkpoint - s, err := pserver.NewService(0, 1, "", nil, cp) + s, err := pserver.NewService(0, time.Hour, "", nil, cp) if err != nil { - t.Error(err) + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err.Error() != pserver.AlreadyInitialized { - t.FailNow() + t.Fatal(err) } } func TestUninitialized(t *testing.T) { var cp pserver.Checkpoint - s, err := pserver.NewService(0, 1, "", nil, cp) + s, err := pserver.NewService(0, time.Hour, "", nil, cp) err = s.SendGrad(pserver.Gradient{}, nil) if err.Error() != pserver.Uninitialized { - t.FailNow() + t.Fatal(err) } } func TestBlockUntilInitialized(t *testing.T) { var cp pserver.Checkpoint - s, err := pserver.NewService(0, 1, "", nil, cp) + s, err := pserver.NewService(0, time.Hour, "", nil, cp) if err != nil { t.Error(err) } @@ -154,12 +168,12 @@ func TestBlockUntilInitialized(t *testing.T) { err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err != nil { - t.FailNow() + t.Fatal(err) } wg.Wait() diff --git a/go/utils/networkhelper/CMakeLists.txt b/go/utils/networkhelper/CMakeLists.txt index db6cf211d8..9233264ff3 100644 --- a/go/utils/networkhelper/CMakeLists.txt +++ b/go/utils/networkhelper/CMakeLists.txt @@ -1,3 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# if(WITH_TESTING) go_test(network_helper_test) endif() diff --git a/go/utils/networkhelper/helper.go b/go/utils/networkhelper/helper.go index fbeaea8f5e..c3fc747bda 100644 --- a/go/utils/networkhelper/helper.go +++ b/go/utils/networkhelper/helper.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package networkhelper import ( diff --git a/go/utils/networkhelper/helper_test.go b/go/utils/networkhelper/helper_test.go index 4208f9e358..0bc02ad42a 100644 --- a/go/utils/networkhelper/helper_test.go +++ b/go/utils/networkhelper/helper_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package networkhelper import "testing" diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh index fa7baccc86..8fd58925ee 100755 --- a/paddle/.set_python_path.sh +++ b/paddle/.set_python_path.sh @@ -21,22 +21,15 @@ # # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...} # - -if ! python -c "import paddle" >/dev/null 2>/dev/null; then - PYPATH="" - set -x - while getopts "d:" opt; do - case $opt in - d) - PYPATH=$OPTARG - ;; - esac - done - shift $(($OPTIND - 1)) - export PYTHONPATH=$PYPATH:$PYTHONPATH - $@ -else - echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment." - echo "Please uninstall paddle package before start unittest. 
Try to 'pip uninstall paddle'" - exit 1 -fi +PYPATH="" +set -x +while getopts "d:" opt; do + case $opt in + d) + PYPATH=$OPTARG + ;; + esac +done +shift $(($OPTIND - 1)) +export PYTHONPATH=$PYPATH:$PYTHONPATH +$@ diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 0b5e9a2599..cf61a243e9 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -14,7 +14,7 @@ if(Boost_FOUND) add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) - add_subdirectory(pybind) + add_subdirectory(operators) endif() if(WITH_C_API) @@ -22,7 +22,5 @@ if(WITH_C_API) endif() if(WITH_SWIG_PY) - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in - ${CMAKE_CURRENT_SOURCE_DIR}/setup.py) add_subdirectory(api) endif() diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index 84da89a142..d7b3d2bdec 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -19,9 +19,9 @@ add_library(paddle_api STATIC ${API_SOURCES}) add_dependencies(paddle_api paddle_proto paddle_trainer_lib) INCLUDE(${SWIG_USE_FILE}) -INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle) +INCLUDE_DIRECTORIES(${PADDLE_SOURCE_DIR}/paddle) -FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) +FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) @@ -79,22 +79,16 @@ SWIG_LINK_LIBRARIES(swig_paddle ${START_END} ) -add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so - COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle - COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle - COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel - COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp - COMMAND rm -rf py_paddle.egg-info build - WORKING_DIRECTORY ${PROJ_ROOT}/paddle +add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle + COMMAND ${CMAKE_COMMAND} -E touch .timestamp + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle DEPENDS _swig_paddle ) # TODO(yuyang18) : make wheel name calculated by cmake -add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so) - -install(DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/dist/ - DESTINATION opt/paddle/share/wheels -) +add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so) if(WITH_TESTING) IF(NOT PY_PIP_FOUND) @@ -108,7 +102,7 @@ if(WITH_TESTING) BUILD_COMMAND "" INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install BUILD_IN_SOURCE 1 - DEPENDS python setuptools python_api_wheel + #DEPENDS python setuptools python_api_wheel ) ENDIF() add_subdirectory(test) diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp index 2f45173bfd..b6ff6ec789 100644 --- a/paddle/api/ConfigParser.cpp +++ b/paddle/api/ConfigParser.cpp @@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const { ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {} -ParameterConfig::~ParameterConfig() { - if (m) { - delete m; - } -} +ParameterConfig::~ParameterConfig() { delete m; } ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr( void* ptr) { @@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); } 
OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {} -OptimizationConfig::~OptimizationConfig() { - if (m) { - delete m; - } -} +OptimizationConfig::~OptimizationConfig() { delete m; } std::string OptimizationConfig::toProtoString() { return m->getConfig().SerializeAsString(); diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp index 681e3a3809..fcda6eaf03 100644 --- a/paddle/api/Evaluator.cpp +++ b/paddle/api/Evaluator.cpp @@ -37,7 +37,7 @@ std::vector Evaluator::getNames() const { double Evaluator::getValue(const std::string name) const { paddle::Error err; double v = m->rawPtr->getValue(name, &err); - if (err) { + if (!err.isOK()) { throw std::runtime_error(err.msg()); } return v; diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 5fb3d1c73b..0b9b83d429 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -843,7 +843,8 @@ public: bool useSparseUpdater); static ParameterUpdater* createNewRemoteUpdater( OptimizationConfig* config, - const std::string pserverSpec) throw(UnsupportError); + const std::string pserverSpec, + const bool useEtcd) throw(UnsupportError); ~ParameterUpdater(); /** diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp index 21b851dd5e..120eea3f70 100644 --- a/paddle/api/ParameterOptimizer.cpp +++ b/paddle/api/ParameterOptimizer.cpp @@ -53,11 +53,7 @@ struct ParameterTraverseCallbackPrivate { ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {} -ParameterOptimizer::~ParameterOptimizer() { - if (m) { - delete m; - } -} +ParameterOptimizer::~ParameterOptimizer() { delete m; } ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) { CHECK(config != nullptr); @@ -104,11 +100,7 @@ std::vector ParameterOptimizer::getParameterTypes() const { ParameterTraverseCallback::ParameterTraverseCallback() : m(new ParameterTraverseCallbackPrivate()) {} -ParameterTraverseCallback::~ParameterTraverseCallback() { - if (m) { - delete m; - } -} +ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; } void ParameterTraverseCallback::apply(const std::vector& vecs, const ParameterConfig& conf, diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp index 1aaefdfb81..8cd73b348c 100644 --- a/paddle/api/ParameterUpdater.cpp +++ b/paddle/api/ParameterUpdater.cpp @@ -33,14 +33,15 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater( ParameterUpdater *ParameterUpdater::createNewRemoteUpdater( OptimizationConfig *config, - const std::string pserverSpec) throw(UnsupportError) { + const std::string pserverSpec, + const bool useEtcd) throw(UnsupportError) { #ifndef PADDLE_WITHOUT_GOLANG auto updater = new ParameterUpdater(); updater->m->updater.reset(new paddle::NewRemoteParameterUpdater( - config->m->getConfig(), pserverSpec)); + config->m->getConfig(), pserverSpec, useEtcd)); return updater; #else - throw UnsupportError(); + throw UnsupportError("not compiled with WITH_GOLANG"); #endif } diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp index db8f005929..500bc448c9 100644 --- a/paddle/api/Vector.cpp +++ b/paddle/api/Vector.cpp @@ -171,11 +171,7 @@ struct VectorPrivate { Vector::Vector() : m(new VectorPrivate()) {} -Vector::~Vector() { - if (m) { - delete m; - } -} +Vector::~Vector() { delete m; } Vector* Vector::createZero(size_t sz, bool useGpu) { auto retVec = new Vector(); diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt index f3b1c2c4d4..761aeb5b17 100644 --- 
a/paddle/api/test/CMakeLists.txt +++ b/paddle/api/test/CMakeLists.txt @@ -1,2 +1,6 @@ -add_python_test(test_swig_api - testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py) +py_test(testTrain SRCS testTrain.py) +py_test(testMatrix SRCS testMatrix.py) +py_test(testVector SRCS testVector.py) +py_test(testTrainer SRCS testTrainer.py) +py_test(testArguments SRCS testArguments.py) +py_test(testGradientMachine SRCS testGradientMachine.py) diff --git a/paddle/capi/Arguments.cpp b/paddle/capi/Arguments.cpp index 8b81ec69e6..1ec403077e 100644 --- a/paddle/capi/Arguments.cpp +++ b/paddle/capi/Arguments.cpp @@ -90,6 +90,18 @@ paddle_error paddle_arguments_set_ids(paddle_arguments args, return kPD_NO_ERROR; } +paddle_error paddle_arguments_set_frame_shape(paddle_arguments args, + uint64_t ID, + uint64_t frameHeight, + uint64_t frameWidth) { + if (args == nullptr) return kPD_NULLPTR; + auto a = castArg(args); + if (ID >= a->args.size()) return kPD_OUT_OF_RANGE; + a->args[ID].setFrameHeight(frameHeight); + a->args[ID].setFrameWidth(frameWidth); + return kPD_NO_ERROR; +} + paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args, uint64_t ID, uint32_t nestedLevel, diff --git a/paddle/capi/arguments.h b/paddle/capi/arguments.h index d71ea26a5d..7c32524a00 100644 --- a/paddle/capi/arguments.h +++ b/paddle/capi/arguments.h @@ -111,6 +111,20 @@ PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args, uint64_t ID, paddle_ivector ids); +/** + * @brief paddle_arguments_set_frame_shape Set the fram size of one argument + * in array, which index is `ID`. + * @param [in] args arguments array + * @param [in] ID array index + * @param [in] frameHeight maximum height of input images + * @param [in] frameWidth maximum width of input images + * @return paddle_error + */ +PD_API paddle_error paddle_arguments_set_frame_shape(paddle_arguments args, + uint64_t ID, + uint64_t frameHeight, + uint64_t frameWidth); + /** * @brief PDArgsSetSequenceStartPos Set sequence start position vector of one * argument in array, which index is `ID`. 
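For reference, a minimal sketch of how the new frame-shape setter could be driven from the C-API. It assumes the pre-existing `paddle_arguments_create_none`, `paddle_arguments_resize`, and `paddle_arguments_destroy` helpers declared in `arguments.h` and the `<paddle/capi.h>` umbrella header; the single argument slot and the 227x227 image size are illustrative only, not part of this change.

```c
#include <stdio.h>
#include <paddle/capi.h>

int main(void) {
  /* One argument slot holding the network input. */
  paddle_arguments args = paddle_arguments_create_none();
  paddle_error err = paddle_arguments_resize(args, 1);
  if (err != kPD_NO_ERROR) return (int)err;

  /* New in this change: record the input frame (image) height and width
   * for slot 0 so image-shaped inputs can be laid out correctly. */
  err = paddle_arguments_set_frame_shape(args, 0, 227, 227);
  if (err != kPD_NO_ERROR) {
    fprintf(stderr, "paddle_arguments_set_frame_shape failed: %d\n", (int)err);
    return (int)err;
  }

  paddle_arguments_destroy(args);
  return 0;
}
```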
diff --git a/paddle/capi/examples/model_inference/common/common.h b/paddle/capi/examples/model_inference/common/common.h index a78522e4a7..e32f2f9836 100644 --- a/paddle/capi/examples/model_inference/common/common.h +++ b/paddle/capi/examples/model_inference/common/common.h @@ -3,18 +3,21 @@ #include #include -#define CHECK(stmt) \ - do { \ - paddle_error __err__ = stmt; \ - if (__err__ != kPD_NO_ERROR) { \ - fprintf(stderr, "Invoke paddle error %d \n" #stmt, __err__); \ - exit(__err__); \ - } \ +#define CHECK(stmt) \ + do { \ + paddle_error __err__ = stmt; \ + if (__err__ != kPD_NO_ERROR) { \ + fprintf(stderr, "Invoke paddle error %d in " #stmt "\n", __err__); \ + exit(__err__); \ + } \ } while (0) void* read_config(const char* filename, long* size) { FILE* file = fopen(filename, "r"); - if (file == NULL) return NULL; + if (file == NULL) { + fprintf(stderr, "Open %s error\n", filename); + return NULL; + } fseek(file, 0L, SEEK_END); *size = ftell(file); fseek(file, 0L, SEEK_SET); diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp index 00f76e0152..629449bbd4 100644 --- a/paddle/capi/gradient_machine.cpp +++ b/paddle/capi/gradient_machine.cpp @@ -54,6 +54,31 @@ paddle_error paddle_gradient_machine_create_for_inference( return kPD_NO_ERROR; } +paddle_error paddle_gradient_machine_create_for_inference_with_parameters( + paddle_gradient_machine* machine, void* mergedModel, uint64_t size) { + if (mergedModel == nullptr) return kPD_NULLPTR; + std::istringstream is(std::string(static_cast(mergedModel), size)); + int64_t modelConfigSize = 0; + is.read((char*)(&modelConfigSize), sizeof(modelConfigSize)); + std::string modelConfigProtobuf; + modelConfigProtobuf.resize(modelConfigSize); + is.read(&modelConfigProtobuf[0], modelConfigSize); + paddle::TrainerConfig config; + if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) { + return kPD_PROTOBUF_ERROR; + } + auto ptr = new paddle::capi::CGradientMachine(); + ptr->machine.reset(paddle::GradientMachine::create( + config.model_config(), CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE})); + std::vector& parameters = ptr->machine->getParameters(); + for (auto& para : parameters) { + para->load(is); + } + + *machine = ptr; + return kPD_NO_ERROR; +} + paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) { delete cast(machine); return kPD_NO_ERROR; @@ -121,3 +146,19 @@ paddle_error paddle_gradient_machine_randomize_param( m->machine->randParameters(); return kPD_NO_ERROR; } + +paddle_error paddle_gradient_machine_get_layer_output( + paddle_gradient_machine machine, + const char* layerName, + paddle_arguments args) { + auto m = cast(machine); + auto out = paddle::capi::cast(args); + if (m == nullptr || layerName == nullptr || out == nullptr || + m->machine == nullptr) { + return kPD_NULLPTR; + } + + auto layerOutput = m->machine->getLayerOutput(layerName); + out->args.push_back(layerOutput); + return kPD_NO_ERROR; +} diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h index d7e2dd9bf8..28eeb23e3b 100644 --- a/paddle/capi/gradient_machine.h +++ b/paddle/capi/gradient_machine.h @@ -36,6 +36,22 @@ typedef void* paddle_gradient_machine; PD_API paddle_error paddle_gradient_machine_create_for_inference( paddle_gradient_machine* machine, void* modelConfigProtobuf, int size); +/** + * @brief Create a gradient machine used for model inference, using config with + * parameters which is generated by `paddle merge_model`. 
+ * Example: + * paddle merge_model \ + * --model_dir="pass-00000" \ + * --model_file="merged_model.paddle" + * @param [out] machine that used for model inference + * @param [in] mergedModel + * @param [in] size + * @return paddle_error + */ +PD_API paddle_error +paddle_gradient_machine_create_for_inference_with_parameters( + paddle_gradient_machine* machine, void* mergedModel, uint64_t size); + /** * @brief Load parameter from disk. * @param machine Gradient Machine. @@ -85,6 +101,18 @@ paddle_gradient_machine_randomize_param(paddle_gradient_machine machine); PD_API paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine); +/** + * @brief Get the output of the layer named `layerName`. + * @param [in] gradient machine that have run a inference + * @param [in] layerName name of specified layer + * @param [out] args output of the specified layer + * @return paddle_error + */ +PD_API paddle_error +paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine, + const char* layerName, + paddle_arguments args); + #ifdef __cplusplus } #endif diff --git a/paddle/capi/tests/CMakeLists.txt b/paddle/capi/tests/CMakeLists.txt index d73f6b7733..8208808b94 100644 --- a/paddle/capi/tests/CMakeLists.txt +++ b/paddle/capi/tests/CMakeLists.txt @@ -10,5 +10,5 @@ target_include_directories(capi_test_gradientMachine PUBLIC ${PADDLE_CAPI_INC_PATH}) target_link_libraries(capi_test_gradientMachine paddle_capi) add_test(NAME capi_test_gradientMachine - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine - WORKING_DIRECTORY ${PROJ_ROOT}/paddle/capi/tests) + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests) diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 73ffa690d9..0865b02c4f 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -39,6 +39,7 @@ set(CUDA_CU_SOURCES src/hl_cuda_lstm.cu src/hl_top_k.cu src/hl_batch_transpose.cu + src/hl_batch_norm.cu src/hl_cuda_sequence.cu src/hl_table_apply.cu) diff --git a/paddle/cuda/include/hl_batch_norm.h b/paddle/cuda/include/hl_batch_norm.h new file mode 100644 index 0000000000..afc5e0b2de --- /dev/null +++ b/paddle/cuda/include/hl_batch_norm.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef HL_BATCH_NORM_H_ +#define HL_BATCH_NORM_H_ + +#include "hl_base.h" + +/** + * @brief batch norm inferece. + * + * @param[in] input input data. + * @param[out] output output data. + * @param[in] scale batch normalization scale parameter (in original + * paper scale is referred to as gamma). + * @param[in] bias batch normalization bias parameter (in original + * paper scale is referred to as beta). 
+ * @param[in] estimatedMean + * @param[in] estimatedVar The moving mean and variance + * accumulated during the training phase are passed + * as inputs here. + * @param[in] epsilon Epsilon value used in the batch + * normalization formula. + */ +extern void hl_batch_norm_cuda_inference(const real* input, + real* output, + const real* scale, + const real* bias, + const real* estimatedMean, + const real* estimatedVar, + const double epsilon, + size_t batchSize, + size_t channel, + size_t height, + size_t width); + +#endif // HL_BATCH_NORM_H_ diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index f55197c8c9..9f84db72da 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -17,73 +17,6 @@ limitations under the License. */ #include "hl_base.h" -/** - * @brief Shrink column to feature. - * - * @param[in] dataCol expand data. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] blockH filter height. - * @param[in] blockW filter width. - * @param[in] strideH stride height. - * @param[in] strideW stride width. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[in] outputH output height. - * @param[in] outputW output width. - * @param[out] dataIm output image data. - * @param[in] alpha - * @param[in] beta - */ -extern void hl_shrink_col2feature(const real* dataCol, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataIm, - real alpha = 1.0f, - real beta = 0.0f); - -/** - * @brief Expand feature to column. - * - * @param[in] dataIm input image data. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] blockH filter height. - * @param[in] blockW filter width. - * @param[in] strideH stride height. - * @param[in] strideW stride width. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[in] outputH output height. - * @param[in] outputW output width. - * @param[out] dataCol expand data. - * - */ -extern void hl_expand_feature2col(const real* dataIm, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataCol); - /** * @brief Maximum pool forward. * diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 039551c6cc..2bbb9fa8df 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -17,36 +17,6 @@ limitations under the License. 
*/ #include "hl_cnn.h" -inline void hl_shrink_col2feature(const real* dataCol, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataIm, - real alpha, - real beta) {} - -inline void hl_expand_feature2col(const real* dataIm, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataCol) {} - inline void hl_maxpool_forward(const int frameCnt, const real* inputData, const int channels, diff --git a/paddle/cuda/src/hl_batch_norm.cu b/paddle/cuda/src/hl_batch_norm.cu new file mode 100644 index 0000000000..5828ecb8e0 --- /dev/null +++ b/paddle/cuda/src/hl_batch_norm.cu @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "hl_batch_norm.h" + +__global__ void batchNormInference(real* output, + const real* input, + const real* scale, + const real* bias, + const real* estimatedMean, + const real* estimatedVar, + const double epsilon, + size_t batchSize, + size_t channel, + size_t height, + size_t width) { + const int tid = threadIdx.x; + const int num = channel * height * width; + const int batch = blockIdx.x; + for (int i = tid; i < num; i += blockDim.x) { + const int c = i / (height * width); + const int id = batch * num + i; + real val = input[id] - estimatedMean[c]; + val /= sqrt(estimatedVar[c] + epsilon); + val *= scale[c]; + val += bias[c]; + output[id] = val; + } +} + +void hl_batch_norm_cuda_inference(const real* input, + real* output, + const real* scale, + const real* bias, + const real* estimatedMean, + const real* estimatedVar, + const double epsilon, + size_t batchSize, + size_t channel, + size_t height, + size_t width) { + batchNormInference<<>>(output, + input, + scale, + bias, + estimatedMean, + estimatedVar, + epsilon, + batchSize, + channel, + height, + width); + + CHECK_SYNC("hl_batch_norm_cuda_inference failed!"); +} diff --git a/paddle/cuda/src/hl_batch_transpose.cu b/paddle/cuda/src/hl_batch_transpose.cu index f047403da1..f4c253df7b 100644 --- a/paddle/cuda/src/hl_batch_transpose.cu +++ b/paddle/cuda/src/hl_batch_transpose.cu @@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_batch_transpose.h" #include "hl_base.h" +#include "hl_batch_transpose.h" const int TILE_DIM = 64; const int BLOCK_ROWS = 16; // No bank-conflict transpose for a batch of data. 
-__global__ void batchTransposeNoBankConflicts(real* odata, - const real* idata, - int numSamples, int width, - int height) { +__global__ void batchTransposeNoBankConflicts( + real* odata, const real* idata, int numSamples, int width, int height) { __shared__ float tile[TILE_DIM][TILE_DIM + 1]; const int x = blockIdx.x * TILE_DIM + threadIdx.x; @@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata, newX] = tile[threadIdx.x][j]; } -void batchTranspose(const real* input, real* output, int width, int height, - int batchSize) { +void batchTranspose( + const real* input, real* output, int width, int height, int batchSize) { dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize); - batchTransposeNoBankConflicts<<>> - (output, input, batchSize, width, height); + batchTransposeNoBankConflicts<<>>( + output, input, batchSize, width, height); CHECK_SYNC("batchTranspose failed!"); } diff --git a/paddle/cuda/src/hl_cuda_aggregate.cu b/paddle/cuda/src/hl_cuda_aggregate.cu index 97034a9177..16a54ad343 100644 --- a/paddle/cuda/src/hl_cuda_aggregate.cu +++ b/paddle/cuda/src/hl_cuda_aggregate.cu @@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - +#include "hl_aggregate.h" #include "hl_base.h" #include "hl_cuda.h" #include "hl_cuda.ph" -#include "hl_aggregate.h" -#include "hl_thread.ph" #include "hl_matrix_base.cuh" +#include "hl_thread.ph" #include "paddle/utils/Logging.h" /** * @brief matrix row operator. */ -template -__global__ void KeMatrixRowOp(Agg agg, - real *E, - real *Sum, - int dimN) { +template +__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) { __shared__ real sum_s[blockSize]; - int cnt = (dimN + blockSize -1) / blockSize; - int rowId = blockIdx.x + blockIdx.y*gridDim.x; - int index = rowId*dimN; + int cnt = (dimN + blockSize - 1) / blockSize; + int rowId = blockIdx.x + blockIdx.y * gridDim.x; + int index = rowId * dimN; int tid = threadIdx.x; int lmt = tid; @@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg, sum_s[tid] = tmp; __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]); } @@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg, } template -void hl_matrix_row_op(Agg agg, - real *A_d, - real *C_d, - int dimM, - int dimN) { +void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { int blocksX = dimM; int blocksY = 1; dim3 threads(128, 1); dim3 grid(blocksX, blocksY); - KeMatrixRowOp<<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, A_d, C_d, dimN); + KeMatrixRowOp<<>>( + agg, A_d, C_d, dimN); } void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::sum(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_sum failed"); } @@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::max(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_max failed"); } @@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) { 
CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_row_op(aggregate::min(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_row_min failed"); } /** * @brief matrix column operator. */ -template -__global__ void KeMatrixColumnOp(Agg agg, - real *E, - real *Sum, - int dimM, - int dimN) { +template +__global__ void KeMatrixColumnOp( + Agg agg, real *E, real *Sum, int dimM, int dimN) { int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; real tmp = agg.init(); if (rowIdx < dimN) { @@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg, } } -template -__global__ void KeMatrixColumnOp_S(Agg agg, - real *E, - real *Sum, - int dimM, - int dimN) { - __shared__ real _sum[blockDimX*blockDimY]; - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - int index = threadIdx.y; +template +__global__ void KeMatrixColumnOp_S( + Agg agg, real *E, real *Sum, int dimM, int dimN) { + __shared__ real _sum[blockDimX * blockDimY]; + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int index = threadIdx.y; real tmp = agg.init(); if (rowIdx < dimN) { @@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg, index += blockDimY; } } - _sum[threadIdx.x + threadIdx.y*blockDimX] = tmp; + _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp; __syncthreads(); if (rowIdx < dimN) { - if (threadIdx.y ==0) { + if (threadIdx.y == 0) { real tmp = agg.init(); - for (int i=0; i < blockDimY; i++) { - tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]); + for (int i = 0; i < blockDimY; i++) { + tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]); } Sum[rowIdx] = tmp; } @@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg, } template -void hl_matrix_column_op(Agg agg, - real *A_d, - real *C_d, - int dimM, - int dimN) { +void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { if (dimN >= 8192) { - int blocksX = (dimN + 128 -1) / 128; + int blocksX = (dimN + 128 - 1) / 128; int blocksY = 1; dim3 threads(128, 1); dim3 grid(blocksX, blocksY); - KeMatrixColumnOp<<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, A_d, C_d, dimM, dimN); + KeMatrixColumnOp<<>>( + agg, A_d, C_d, dimM, dimN); } else { - int blocksX = (dimN + 32 -1) / 32; + int blocksX = (dimN + 32 - 1) / 32; int blocksY = 1; dim3 threads(32, 32); dim3 grid(blocksX, blocksY); - KeMatrixColumnOp_S<<< grid, threads, 0, STREAM_DEFAULT>>> - (agg, A_d, C_d, dimM, dimN); + KeMatrixColumnOp_S<<>>( + agg, A_d, C_d, dimM, dimN); } return; @@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::sum(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_sum failed"); } @@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::max(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_max failed"); } @@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); - hl_matrix_column_op(aggregate::min(), - A_d, - C_d, - dimM, - dimN); + hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN); CHECK_SYNC("hl_matrix_column_min failed"); } @@ -226,16 +184,16 @@ template __global__ void KeVectorSum(real *E, real *Sum, int dimM) { __shared__ double sum_s[blockSize]; int 
tid = threadIdx.x; - int index = blockIdx.y*blockDim.x+threadIdx.x; + int index = blockIdx.y * blockDim.x + threadIdx.x; sum_s[tid] = 0.0f; while (index < dimM) { sum_s[tid] += E[index]; - index += blockDim.x*gridDim.y; + index += blockDim.x * gridDim.y; } __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] += sum_s[tid + stride]; } @@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) { dim3 threads(blockSize, 1); dim3 grid(blocksX, blocksY); - struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; + struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; hl_event_t hl_event = &hl_event_st; - while (!hl_cuda_event_is_ready(hl_event)) {} + while (!hl_cuda_event_is_ready(hl_event)) { + } - KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, t_resource.gpu_mem, dimM); - KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> - (t_resource.gpu_mem, t_resource.cpu_mem, 128); + KeVectorSum<128><<>>( + A_d, t_resource.gpu_mem, dimM); + KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( + t_resource.gpu_mem, t_resource.cpu_mem, 128); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_synchronize(HPPL_STREAM_DEFAULT); cudaError_t err = (cudaError_t)hl_get_device_last_error(); - CHECK_EQ(cudaSuccess, err) - << "CUDA error: " << hl_get_device_error_string((size_t)err); + CHECK_EQ(cudaSuccess, err) << "CUDA error: " + << hl_get_device_error_string((size_t)err); } template __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) { __shared__ double sum_s[blockSize]; int tid = threadIdx.x; - int index = blockIdx.y*blockDim.x+threadIdx.x; + int index = blockIdx.y * blockDim.x + threadIdx.x; sum_s[tid] = 0.0f; while (index < dimM) { sum_s[tid] += abs(E[index]); - index += blockDim.x*gridDim.y; + index += blockDim.x * gridDim.y; } __syncthreads(); - for (int stride = blockSize/2; stride > 0; stride = stride/2) { + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { if (tid < stride) { sum_s[tid] += sum_s[tid + stride]; } @@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { dim3 threads(blockSize, 1); dim3 grid(blocksX, blocksY); - struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; + struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; hl_event_t hl_event = &hl_event_st; - while (!hl_cuda_event_is_ready(hl_event)) {} + while (!hl_cuda_event_is_ready(hl_event)) { + } - KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, t_resource.gpu_mem, dimM); - KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>> - (t_resource.gpu_mem, t_resource.cpu_mem, 128); + KeVectorAbsSum<128><<>>( + A_d, t_resource.gpu_mem, dimM); + KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( + t_resource.gpu_mem, t_resource.cpu_mem, 128); hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); hl_stream_synchronize(HPPL_STREAM_DEFAULT); cudaError_t err = (cudaError_t)hl_get_device_last_error(); - CHECK_EQ(cudaSuccess, err) - << "CUDA error: " << hl_get_device_error_string((size_t)err); + CHECK_EQ(cudaSuccess, err) << "CUDA error: " + << hl_get_device_error_string((size_t)err); } diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index b94f4d8fe4..aac19b1ea5 100644 --- 
a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -12,149 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "hl_base.h" #include "hl_cnn.h" #include "hl_device_functions.cuh" -__global__ void KeFeature2col(size_t n, size_t height, const real* data_im, - size_t blockH, size_t blockW, size_t width, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - real* data_col) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - size_t w_out = index % width_col; - index /= width_col; - size_t h_out = index % height_col; - size_t channel_in = index / height_col; - size_t channel_out = channel_in * blockH * blockW; - size_t h_in = h_out * strideH; - size_t w_in = w_out * strideW; - - data_col += (channel_out * height_col + h_out) * width_col + w_out; - for (size_t i = 0; i < blockH; ++i) { - for (size_t j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { - *data_col = 0; - } else { - rIdx = rIdx + channel_in*height - paddingH; - cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; - } - data_col += height_col * width_col; - } - } - } -} - -void hl_expand_feature2col(const real* dataIm, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataCol) { - size_t numKernels = channels * outputH * outputW; - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - KeFeature2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, height, dataIm, blockH, blockW, width, - strideH, strideW, paddingH, paddingW, - outputH, outputW, dataCol); - CHECK_SYNC("hl_expand_feature2col failed"); -} - -__global__ void KeCol2Feature(size_t n, const real* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - real* data_im, real alpha, real beta) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - real val = 0; - int w = int(index % width); - int h = int((index / width) % height); - int c = int(index / (width * height)); - if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - paddingH) < (height - 2 * paddingH)) { - // compute the start and end of the output - int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); - int h_col_start = - (h < (int)blockH) ? 
0 : (h - (int)blockH) / (int)strideH + 1; - int h_col_end = min(int(h / strideH + 1), int(height_col)); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - h -= paddingH; - w -= paddingW; - real tD = data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w]; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] = alpha * val + beta*tD; - } - } -} - -void hl_shrink_col2feature(const real * dataCol, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataIm, real alpha, real beta) { - size_t numKernels = channels * (height + 2*paddingH) * (width + 2*paddingW); - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - KeCol2Feature<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, dataCol, height + 2*paddingH, width + 2*paddingW, - channels, blockH, blockW, strideH, strideW, paddingH, paddingW, - outputH, outputW, dataIm, alpha, beta); - CHECK_SYNC("hl_shrink_col2feature failed"); -} - -__global__ void KeMaxPoolForward(const int nthreads, const real* inputData, - const int channels, const int height, +__global__ void KeMaxPoolForward(const int nthreads, + const real* inputData, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int ksizeW, const int ksizeH, - const int strideH, const int strideW, - const int offsetH, const int offsetW, - real* tgtData, const int tgtStride) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + const int pooledH, + const int pooledW, + const int ksizeW, + const int ksizeH, + const int strideH, + const int strideW, + const int offsetH, + const int offsetW, + real* tgtData, + const int tgtStride) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; int ph = (index / pooledW) % pooledH; @@ -174,44 +52,70 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, maxval = inputData[h * width + w]; } } - int tgtIndex = index % (pooledW * pooledH * channels) + - frameNum * tgtStride; + int tgtIndex = + index % (pooledW * pooledH * channels) + frameNum * tgtStride; tgtData[tgtIndex] = maxval; } } -void hl_maxpool_forward(const int frameCnt, const real* inputData, +void hl_maxpool_forward(const int frameCnt, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride) { - + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) { int num_kernels = pooledH * pooledW * 
channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); - KeMaxPoolForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, channels, height, width, - pooledH, pooledW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData, tgtStride); + KeMaxPoolForward<<>>(num_kernels, + inputData, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + tgtData, + tgtStride); CHECK_SYNC("hl_maxpool_forward failed"); } -__global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, +__global__ void KeMaxPoolBackward(const int nthreads, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real scaleA, real scaleB, - real* targetGrad, const int outStride) { - int index = blockIdx.x * blockDim.x + threadIdx.x; + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) { + int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { // find out the local index // find out the local offset @@ -235,43 +139,69 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, } } } - targetGrad[index] = - scaleB * targetGrad[index] + scaleA * gradient; + targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient; } } -void hl_maxpool_backward(const int frameCnt, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, - const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* targetGrad, const int outStride) { - +void hl_maxpool_backward(const int frameCnt, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeMaxPoolBackward<<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, outData, outGrad, channels, - height, width, pooledH, pooledW, sizeX, sizeY, - strideH, strideW, - paddingH, paddingW, - scaleA, scaleB, - targetGrad, outStride); + KeMaxPoolBackward<<>>(num_kernels, + inputData, + outData, + outGrad, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleA, + scaleB, + targetGrad, + outStride); CHECK_SYNC("hl_maxpool_backward"); } -__global__ void KeAvgPoolForward(const int nthreads, const real* inputData, +__global__ void KeAvgPoolForward(const int nthreads, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const 
int strideW, - const int padH, const int padW, - real* tgtData, const int tgtStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real* tgtData, + const int tgtStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -296,39 +226,64 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData, aveval += inputData[h * width + w]; } } - int tgtIndex = index % (pooledW * pooledH * channels) + - frameNum * tgtStride; + int tgtIndex = + index % (pooledW * pooledH * channels) + frameNum * tgtStride; tgtData[tgtIndex] = aveval / pool_size; } } -void hl_avgpool_forward(const int frameCnt, const real* inputData, +void hl_avgpool_forward(const int frameCnt, + const real* inputData, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeAvgPoolForward<<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, inputData, channels, - height, width, pooledH, pooledW, - sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData, tgtStride); + KeAvgPoolForward<<>>(num_kernels, + inputData, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + tgtData, + tgtStride); CHECK_SYNC("hl_avgpool_forward failed"); } -__global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, - const int channels, const int height, +__global__ void KeAvgPoolBackward(const int nthreads, + const real* outGrad, + const int channels, + const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int padH, const int padW, - real scaleA, real scaleB, - real* tgtGrad, const int outStride) { + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int padH, + const int padW, + real scaleA, + real scaleB, + real* tgtGrad, + const int outStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int offsetW = index % width + padW; @@ -343,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, real gradient = 0; outGrad += (frameNum * outStride + offsetC * pooledH * pooledW); - for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size @@ -352,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, int hend = min(hstart + sizeY, height + padH); int wend = min(wstart + sizeX, width + padW); int poolsize = (hend - hstart) * (wend - wstart); - gradient += outGrad[ph * pooledW + pw]/poolsize; + gradient += outGrad[ph * pooledW + pw] / poolsize; } } tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient; } } -void hl_avgpool_backward(const int frameCnt, const real* outGrad, 
+void hl_avgpool_backward(const int frameCnt, + const real* outGrad, const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* backGrad, const int outStride) { + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* backGrad, + const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; - KeAvgPoolBackward <<< blocks, 1024, 0, STREAM_DEFAULT >>> - (num_kernels, outGrad, channels, height, width, - pooledH, pooledW, sizeX, sizeY, - strideH, strideW, - paddingH, paddingW, - scaleA, scaleB, - backGrad, outStride); + KeAvgPoolBackward<<>>(num_kernels, + outGrad, + channels, + height, + width, + pooledH, + pooledW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleA, + scaleB, + backGrad, + outStride); CHECK_SYNC("hl_avgpool_backward failed"); } @@ -394,7 +366,7 @@ __global__ void KeBilinearInterpFw(const real* in, const size_t numChannels, const real ratioH, const real ratioW) { - int nthreads = outputH * outputW; + int nthreads = outputH * outputW; int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < nthreads) { int outIdH = tid / outputW; @@ -415,13 +387,14 @@ __global__ void KeBilinearInterpFw(const real* in, real w1lambda = ratioW * outImgIdx - inImgIdx; real w2lambda = 1.f - w1lambda; - const real* inPos = - &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; + const real* inPos = &in[outIdH * inputW + channelId * inImgSize + + inImgIdy * inImgW + inImgIdx]; // bilinear interpolation out[outIdH * outputW + outIdW] = - h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) + - h1lambda * (w2lambda * inPos[hId * inImgW] + w1lambda * inPos[hId * inImgW + wId]); + h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) + + h1lambda * (w2lambda * inPos[hId * inImgW] + + w1lambda * inPos[hId * inImgW + wId]); } } @@ -441,9 +414,19 @@ void hl_bilinear_forward(const real* inData, int threadNum = outputH * outputW; int blocks = (threadNum + 1024 - 1) / 1024; - KeBilinearInterpFw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - inData, inImgH, inImgW, inputH, inputW, outData, outImgH, - outImgW, outputH, outputW, numChannels, ratioH, ratioW); + KeBilinearInterpFw<<>>(inData, + inImgH, + inImgW, + inputH, + inputW, + outData, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); CHECK_SYNC("hl_bilinear_forward failed"); } @@ -481,13 +464,15 @@ __global__ void KeBilinearInterpBw(real* in, real w1lambda = ratioW * outImgIdx - inImgIdx; real w2lambda = 1.f - w1lambda; - real* inPos = - &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; + real* inPos = &in[outIdH * inputW + channelId * inImgSize + + inImgIdy * inImgW + inImgIdx]; const real* outPos = &out[outIdH * outputW + outIdW]; paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]); paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]); - paddle::paddleAtomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]); - paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], h1lambda * w1lambda * outPos[0]); + paddle::paddleAtomicAdd(&inPos[hId * inImgW], + h1lambda * w2lambda * outPos[0]); + 
paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], + h1lambda * w1lambda * outPos[0]); } } @@ -507,22 +492,37 @@ void hl_bilinear_backward(real* inGrad, int threadNum = outputH * outputW; int blocks = (threadNum + 1024 - 1) / 1024; - KeBilinearInterpBw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, - outImgW, outputH, outputW, numChannels, ratioH, ratioW); + KeBilinearInterpBw<<>>(inGrad, + inImgH, + inImgW, + inputH, + inputW, + outGrad, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); CHECK_SYNC("hl_bilinear_backward failed"); } -__global__ void maxoutFpCompute(size_t nthreads, const real * inData, - real * outData, int* idData, - size_t size, size_t featLen, size_t groups) { +__global__ void maxoutFpCompute(size_t nthreads, + const real* inData, + real* outData, + int* idData, + size_t size, + size_t featLen, + size_t groups) { int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < nthreads) { + if (index < nthreads) { size_t batch_idx = index / size; size_t i = index % size; size_t channel_idx = i / featLen; size_t feat_idx = i % featLen; - size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx; + size_t data_idx = + (batch_idx * size + channel_idx * featLen) * groups + feat_idx; real max = inData[data_idx]; int maxId = 0; for (size_t g = 1; g < groups; ++g) { @@ -537,37 +537,50 @@ __global__ void maxoutFpCompute(size_t nthreads, const real * inData, } } -void hl_maxout_forward(const real* inData, real* outData, - int* idData, size_t batchSize, size_t size, - size_t featLen, size_t groups) { +void hl_maxout_forward(const real* inData, + real* outData, + int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t groups) { int num_kernels = size * batchSize; int blocks = (num_kernels + 1024 - 1) / 1024; - maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>( - num_kernels, inData, outData, idData, size, featLen, groups); + maxoutFpCompute<<>>( + num_kernels, inData, outData, idData, size, featLen, groups); CHECK_SYNC("hl_maxout_forward failed"); } -__global__ void maxoutBpCompute(size_t nthreads, real* inGrad, - const real* outGrad, const int* idData, - size_t size, size_t featLen, size_t groups) { +__global__ void maxoutBpCompute(size_t nthreads, + real* inGrad, + const real* outGrad, + const int* idData, + size_t size, + size_t featLen, + size_t groups) { int index = blockIdx.x * blockDim.x + threadIdx.x; - if(index < nthreads) { + if (index < nthreads) { size_t batch_idx = index / size; size_t i = index % size; size_t channel_idx = i / featLen; size_t feat_idx = i % featLen; size_t newIndex = batch_idx * size; - size_t gradIdx = (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; + size_t gradIdx = + (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i]; } } -void hl_maxout_backward(real* inGrad, const real* outGrad, - const int* idData, size_t batchSize, size_t size, - size_t featLen, size_t groups) { +void hl_maxout_backward(real* inGrad, + const real* outGrad, + const int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t groups) { int num_kernels = size * batchSize; int blocks = (num_kernels + 1024 - 1) / 1024; - maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>( - num_kernels, inGrad, outGrad, idData, size, featLen, groups); + maxoutBpCompute<<>>( + num_kernels, inGrad, outGrad, idData, size, featLen, groups); 
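// For reference, the index math in maxoutFpCompute above written as a plain
// host-side loop (illustrative only; buffer names are hypothetical). The input
// is laid out as [batchSize, channels * groups, featLen], the output and id
// buffers as [batchSize, channels, featLen], and size == channels * featLen:
void maxout_forward_reference(const real* in, real* out, int* id,
                              size_t batchSize, size_t size, size_t featLen,
                              size_t groups) {
  const size_t channels = size / featLen;
  for (size_t b = 0; b < batchSize; ++b) {
    for (size_t c = 0; c < channels; ++c) {
      for (size_t f = 0; f < featLen; ++f) {
        // same expression as data_idx in the kernel
        size_t base = (b * size + c * featLen) * groups + f;
        real best = in[base];
        int bestGroup = 0;
        for (size_t g = 1; g < groups; ++g) {
          if (in[base + g * featLen] > best) {
            best = in[base + g * featLen];
            bestGroup = static_cast<int>(g);
          }
        }
        out[b * size + c * featLen + f] = best;
        id[b * size + c * featLen + f] = bestGroup;
      }
    }
  }
}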
CHECK_SYNC("hl_maxout_backward failed"); } diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index c53a563682..78642a1744 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -1022,6 +1022,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, real alpha = 1.0f; real beta = 1.0f; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; + CHECK_CUDNN( dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle, mode, diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu index b869d903ba..a5ce81a904 100644 --- a/paddle/cuda/src/hl_cuda_lstm.cu +++ b/paddle/cuda/src/hl_cuda_lstm.cu @@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - +#include "hl_activation_functions.h" #include "hl_base.h" #include "hl_cuda_cublas.h" #include "hl_device_functions.cuh" -#include "hl_activation_functions.h" #include "paddle/utils/Logging.h" -typedef hppl::Active::forward t_forward; +typedef hppl::Active::forward t_forward; typedef hppl::Active::backward t_backward; bool hl_lstm_sequence_parallel(int frameSize) { @@ -42,9 +41,9 @@ public: value_ += (start + length - 1) * frameSize + idx; } } - __device__ inline real *getPtr() const {return value_;} - __device__ inline real getValue() {return *value_;} - __device__ inline void setValue(real value) {*value_ = value;} + __device__ inline real *getPtr() const { return value_; } + __device__ inline real getValue() { return *value_; } + __device__ inline void setValue(real value) { *value_ = value; } template __device__ inline void nextFrame() { if (reversed == 0) { @@ -55,28 +54,25 @@ public: } }; -__device__ __forceinline__ -void ptx_sync(const int id, const int barriers) { +__device__ __forceinline__ void ptx_sync(const int id, const int barriers) { asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory"); } -__device__ __forceinline__ -void ptx_arrive(const int id, const int barriers) { +__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) { asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory"); } -template -__device__ __forceinline__ real -forward_sequence(real value, - real *shValue, - real *state, - real *preOutput, - real *output, - real check, - int index, - t_forward activeNode, - t_forward activeGate, - t_forward activeState) { +template +__device__ __forceinline__ real forward_sequence(real value, + real *shValue, + real *state, + real *preOutput, + real *output, + real check, + int index, + t_forward activeNode, + t_forward activeGate, + t_forward activeState) { real out; real prevOut; real state_r; @@ -112,17 +108,20 @@ forward_sequence(real value, if (idy == 0) { ptx_sync(2, frameSize * 2); prevOut = state[idx]; - prevOut = activeState(prevOut); + prevOut = activeState(prevOut); preOutput[idx] = prevOut; ptx_arrive(3, frameSize * 2); } return value; } -#define OUTPUT_BARRIER_ID 10 -#define OUTPUT_BARRIER_ID2 11 -template +#define OUTPUT_BARRIER_ID 10 +#define OUTPUT_BARRIER_ID2 11 +template __global__ void KeLstmForward(real *gateValue, real *state, real *output, @@ -184,10 +183,16 @@ __global__ void KeLstmForward(real *gateValue, } } value = forward_sequence( - value, shValue, shState, shPrevOutput, shOutput, check, index, - hppl::gpu::forward[active_node], - hppl::gpu::forward[active_gate], - hppl::gpu::forward[active_state]); + value, 
+ shValue, + shState, + shPrevOutput, + shOutput, + check, + index, + hppl::gpu::forward[active_node], + hppl::gpu::forward[active_gate], + hppl::gpu::forward[active_state]); const int idx = index % frameSize; const int idy = index / frameSize; if (valueSize == 128) { @@ -218,7 +223,7 @@ __global__ void KeLstmForward(real *gateValue, real B_r[frameSize]; const int computeIdx = index - valueSize; if (i == 0) { - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { B_r[n] = weight[n * valueSize + computeIdx]; } @@ -230,7 +235,7 @@ __global__ void KeLstmForward(real *gateValue, } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += A_r[n]*B_r[n]; + sum += A_r[n] * B_r[n]; } shValue[computeIdx] = sum; ptx_arrive(OUTPUT_BARRIER_ID2, blockSize); @@ -239,14 +244,14 @@ __global__ void KeLstmForward(real *gateValue, if (valueSize == 256) { real B_r[frameSize]; if (i == 0) { - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { B_r[n] = weight[n * valueSize + index]; } } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += shOutput[n]*B_r[n]; + sum += shOutput[n] * B_r[n]; } value += sum; } @@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue, dim3 grid(numSequences, 1); if (!reversed) { if (frameSize == 32) { - KeLstmForward<128, 32, 0, 128, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<128, 32, 0, 128, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmForward<256, 64, 0, 256, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<256, 64, 0, 256, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } } else { if (frameSize == 32) { - KeLstmForward<128, 32, 1, 128, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<128, 32, 1, 128, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmForward<256, 64, 1, 256, 256> - <<>> - (gateValue, stateValue, outputValue, preOutputValue, - checkIg, checkFg, checkOg, weight, sequence, - active_node, active_gate, active_state); + KeLstmForward<256, 64, 1, 256, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); } } CHECK_SYNC("hl_lstm_parallel_forward failed"); } -__device__ __forceinline__ -void transpose_32x32(real a[], const int idx) { +__device__ __forceinline__ void transpose_32x32(real a[], const int idx) { int addr = idx % 32; - #pragma unroll +#pragma unroll for (int k = 1; k < 32; k++) { // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32); addr = __shfl(addr, (idx + 1) % 32, 32); a[k] = __shfl(a[k], addr, 32); } - #pragma unroll +#pragma unroll for (int tid = 0; tid < 31; tid++) { real tmp = (idx > tid) ? 
a[0] : a[1]; - #pragma unroll +#pragma unroll for (int k = 31; k > 0; k--) { a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32]; } @@ -324,29 +360,28 @@ void transpose_32x32(real a[], const int idx) { } addr = (32 - idx) % 32; - #pragma unroll +#pragma unroll for (int k = 0; k < 32; k++) { a[k] = __shfl(a[k], addr, 32); addr = __shfl(addr, (idx + 31) % 32, 32); } } -template -__device__ void -backward_sequence(real rGateValue, - real rOutputGrad, - real rPreOutputValue, - real &rGateGrad, - real &rStateGrad, - real *shStateGrad, - real *shStateValue, - real *shGateValue, - real rCheck, - real &rGateValuePrev, - int index, - t_backward activeNode, - t_backward activeGate, - t_backward activeState) { +template +__device__ void backward_sequence(real rGateValue, + real rOutputGrad, + real rPreOutputValue, + real &rGateGrad, + real &rStateGrad, + real *shStateGrad, + real *shStateValue, + real *shGateValue, + real rCheck, + real &rGateValuePrev, + int index, + t_backward activeNode, + t_backward activeGate, + t_backward activeState) { const int frameIdx = index % frameSize; const int frameIdy = index / frameSize; if (frameIdy == 3) { @@ -363,8 +398,8 @@ backward_sequence(real rGateValue, rStateGrad = rGateGrad * rCheck; shStateGrad[index] = rStateGrad; ptx_sync(3, valueSize); - rStateGrad += shStateGrad[frameIdx + frameSize *2]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 2]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateGrad = rStateGrad * shGateValue[frameIdx]; rGateGrad = activeGate(rGateGrad, rGateValue); } else if (frameIdy == 2) { @@ -373,7 +408,7 @@ backward_sequence(real rGateValue, shStateGrad[index] = rStateGrad; ptx_sync(3, valueSize); rStateGrad += shStateGrad[frameIdx + frameSize]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateValuePrev = rGateValue; rGateGrad = rStateGrad * shStateValue[frameIdx]; rGateGrad = activeGate(rGateGrad, rGateValue); @@ -381,43 +416,43 @@ backward_sequence(real rGateValue, shGateValue[frameIdx] = rGateValue; ptx_sync(3, valueSize); rStateGrad = shStateGrad[frameIdx + frameSize]; - rStateGrad += shStateGrad[frameIdx + frameSize *2]; - rStateGrad += shStateGrad[frameIdx + frameSize *3]; + rStateGrad += shStateGrad[frameIdx + frameSize * 2]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize]; rGateGrad = activeNode(rGateGrad, rGateValue); } } -template +template __device__ void load_weight(real rWeight[], real *weight, const int index) { if (valueSize == 128) { weight += index; - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { - rWeight[n] = weight[n*valueSize]; + rWeight[n] = weight[n * valueSize]; } transpose_32x32(rWeight, index % 32); } if (valueSize == 256) { int id = (index / 32) % 2; weight += index - id * 32 + id * 32 * valueSize; - #pragma unroll +#pragma unroll for (int n = 0; n < 32; n++) { - rWeight[n] = weight[n*valueSize]; - rWeight[n + 32] = weight[n*valueSize + 32]; + rWeight[n] = weight[n * valueSize]; + rWeight[n + 32] = weight[n * valueSize + 32]; } transpose_32x32(rWeight, index % 32); transpose_32x32(&rWeight[32], index % 32); } } -template +template __global__ void KeLstmBackward(real *gateValue, real *gateGrad, real *stateValue, - real *stateGrad, /* do not need save */ + real *stateGrad, /* do not need save */ real *preOutputValue, - real *preOutputGrad, /* do not need save */ + real *preOutputGrad, /* do 
not need save */ real *checkIg, real *checkIgGrad, real *checkFg, @@ -484,20 +519,27 @@ __global__ void KeLstmBackward(real *gateValue, for (int i = 0; i < length; ++i) { if (frameIdy == 3) { - if (i != length -1) { + if (i != length - 1) { frameStateValue.nextFrame(); shStateValue[frameIdx] = frameStateValue.getValue(); } else { shStateValue[frameIdx] = 0.0; } } - backward_sequence( - rGateValue, rOutputGrad, rPreOutputValue, rGateGrad, - rStateGrad, shStateGrad, shStateValue, shGateValue, - rCheck, rGateValuePrev, index, - hppl::gpu::backward[active_node], - hppl::gpu::backward[active_gate], - hppl::gpu::backward[active_state]); + backward_sequence(rGateValue, + rOutputGrad, + rPreOutputValue, + rGateGrad, + rStateGrad, + shStateGrad, + shStateValue, + shGateValue, + rCheck, + rGateValuePrev, + index, + hppl::gpu::backward[active_node], + hppl::gpu::backward[active_gate], + hppl::gpu::backward[active_state]); if (frameIdy == 3) { rCheckGrad += rGateGrad * rStateValue; rStateValue = shStateValue[frameIdx]; @@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue, shGateGrad[frameIdy][frameIdx] = rGateGrad; if (valueSize == 128) { real sum = 0.0f; - #pragma unroll +#pragma unroll for (int n = 0; n < frameSize; n++) { - sum += shGateGrad[frameIdy][n]*B_r[n]; + sum += shGateGrad[frameIdy][n] * B_r[n]; } if (frameIdy == 3) { rOutputGrad += sum; @@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue, } real sum = 0.0f; for (int n = 0; n < frameSize; n++) { - sum += A_r[n]*B_r[n]; + sum += A_r[n] * B_r[n]; } if (frameIdy == 3) { rOutputGrad += sum; @@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue, if (frameIdy == 3) { ptx_sync(6, valueSize); - #pragma unroll - for (int i = 0; i < 3; i ++) { +#pragma unroll + for (int i = 0; i < 3; i++) { rOutputGrad += shOutputGrad[i][frameIdx]; } } else { @@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue, /* TODO: Temporary save & merger in another kernel */ if (frameIdy == 1) { - if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad); + if (checkIgGrad) + paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad); } else if (frameIdy == 2) { - if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad); + if (checkFgGrad) + paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad); } else if (frameIdy == 3) { - if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad); + if (checkOgGrad) + paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad); } } @@ -593,68 +638,183 @@ void hl_lstm_parallel_backward_data(real *gateValue, hl_activation_mode_t active_node, hl_activation_mode_t active_gate, hl_activation_mode_t active_state) { - CHECK(frameSize == 32 || frameSize == 64 || - frameSize == 128 || frameSize == 256); + CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 || + frameSize == 256); dim3 grid(numSequences, 1); if (!reversed) { if (frameSize == 32) { - KeLstmBackward<128, 32, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<128, 32, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmBackward<256, 
64, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<256, 64, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 128) { - KeLstmBackward<512, 128, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<512, 128, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 256) { - KeLstmBackward<1024, 256, 0><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<1024, 256, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } } else { if (frameSize == 32) { - KeLstmBackward<128, 32, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<128, 32, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 64) { - KeLstmBackward<256, 64, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<256, 64, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 128) { - KeLstmBackward<512, 128, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<512, 128, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } else if (frameSize == 256) { - KeLstmBackward<1024, 256, 1><<>> - (gateValue, gateGrad, stateValue, stateGrad, preOutputValue, - preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, 
checkOg, - checkOgGrad, outputGrad, weight, sequence, - active_node, active_gate, active_state); + KeLstmBackward<1024, 256, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); } } CHECK_SYNC("hl_lstm_parallel_backward_data"); } -template +template __global__ void KeSetGradZero(real *gateGrad, - const int *starts, int valueSize, int numSequences, bool reversed) { + const int *starts, + int valueSize, + int numSequences, + bool reversed) { // const int tid = threadIdx.x; const int frameIdx = blockIdx.x * B_X + threadIdx.x; @@ -682,19 +842,31 @@ void hl_lstm_parallel_backward_weight(real *weightGrad, int valueSize = 4 * frameSize; dim3 threads(32, 32); dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32); - KeSetGradZero<32, 32><<>> - (gateGrad, sequence, valueSize, numSequences, reversed); + KeSetGradZero<32, 32><<>>( + gateGrad, sequence, valueSize, numSequences, reversed); if (!reversed) { hl_matrix_mul(outputValue, - HPPL_OP_T, gateGrad + valueSize, HPPL_OP_N, weightGrad, - frameSize, valueSize, batchSize - 1, - 1.0, 1.0); + HPPL_OP_T, + gateGrad + valueSize, + HPPL_OP_N, + weightGrad, + frameSize, + valueSize, + batchSize - 1, + 1.0, + 1.0); } else { hl_matrix_mul(outputValue + frameSize, - HPPL_OP_T, gateGrad, HPPL_OP_N, weightGrad, - frameSize, valueSize, batchSize - 1, - 1.0, 1.0); + HPPL_OP_T, + gateGrad, + HPPL_OP_N, + weightGrad, + frameSize, + valueSize, + batchSize - 1, + 1.0, + 1.0); } CHECK_SYNC("hl_lstm_parallel_backward_weight"); } diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 9bcc7fb7de..39272456c3 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "hl_base.h" +#include "hl_device_functions.cuh" +#include "hl_gpu_matrix_kernel.cuh" #include "hl_matrix.h" -#include "hl_matrix_ops.cuh" #include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" #include "hl_sequence.h" #include "hl_sparse.ph" #include "paddle/utils/Logging.h" -#include "hl_device_functions.cuh" -#include "hl_gpu_matrix_kernel.cuh" DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); -void hl_matrix_add(real *A_d, - real *B_d, - real *C_d, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b); +void hl_matrix_add(real* A_d, + real* B_d, + real* C_d, int dimM, int dimN, real alpha, @@ -36,33 +35,32 @@ void hl_matrix_add(real *A_d, CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); - hl_gpu_apply_ternary_op - , 0, 0>(ternary::_add(alpha, beta), - A_d, - B_d, - C_d, - dimM, - dimN, - dimN, - dimN, - dimN); + hl_gpu_apply_ternary_op, 0, 0>( + ternary::_add(alpha, beta), + A_d, + B_d, + C_d, + dimM, + dimN, + dimN, + dimN, + dimN); CHECK_SYNC("hl_matrix_add failed"); } #ifdef PADDLE_TYPE_DOUBLE - #define THRESHOLD 128 +#define THRESHOLD 128 #else - #define THRESHOLD 64 +#define THRESHOLD 64 #endif -__device__ __forceinline__ -void findMax(real* I, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN, - real* max) { +__device__ __forceinline__ void findMax(real* I, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN, + real* max) { dfMax_s[base] = -1.0e20; while (curIdx < dimN) { if (dfMax_s[base] < I[nextIdx]) { @@ -78,25 +76,24 @@ void findMax(real* I, if (base < stride) { nextIdx = base + stride; if (dfMax_s[base] < dfMax_s[nextIdx]) { - dfMax_s[base] = dfMax_s[nextIdx]; + dfMax_s[base] = dfMax_s[nextIdx]; } } } - if (0 == base) { + if (0 == base) { max[0] = dfMax_s[0]; } __syncthreads(); } -__device__ __forceinline__ -void subMaxAndExp(real* I, - real* O, - int curIdx, - int nextIdx, - int blockSize, - int dimN, - real max) { +__device__ __forceinline__ void subMaxAndExp(real* I, + real* O, + int curIdx, + int nextIdx, + int blockSize, + int dimN, + real max) { real val; while (curIdx < dimN) { val = I[nextIdx] - max; @@ -115,14 +112,13 @@ void subMaxAndExp(real* I, __syncthreads(); } -__device__ __forceinline__ -void valueSum(real* O, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN) { +__device__ __forceinline__ void valueSum(real* O, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN) { dfMax_s[base] = 0; while (curIdx < dimN) { dfMax_s[base] += O[nextIdx]; @@ -141,13 +137,8 @@ void valueSum(real* O, __syncthreads(); } -__device__ __forceinline__ -void divSum(real* O, - real sum, - int curIdx, - int nextIdx, - int blockSize, - int dimN) { +__device__ __forceinline__ void divSum( + real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) { while (curIdx < dimN) { O[nextIdx] /= sum; nextIdx += blockSize; @@ -155,20 +146,18 @@ void divSum(real* O, } } -__device__ __forceinline__ -void softmax(real* I, - real* O, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN) { +__device__ __forceinline__ void softmax(real* I, + real* O, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN) { __shared__ real max; // find the max number - findMax(I, dfMax_s, blockSize, base, curIdx, - nextIdx, dimN, &max); + findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max); // sub max 
Value and do Exp operation subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max); @@ -181,8 +170,8 @@ void softmax(real* I, divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN); } -template -__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { +template +__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) { int base = threadIdx.x; __shared__ real dfMax_s[blockSize]; int nextIdx = blockIdx.x * dimN + base; @@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) { softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); } -void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) { +void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); dim3 block(512, 1); dim3 grid(dimM, 1); - KeMatrixSoftMax<512> - <<>>(C_d, A_d, dimN); + KeMatrixSoftMax<512><<>>(C_d, A_d, dimN); CHECK_SYNC("hl_matrix_softmax failed"); } -template -__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { +template +__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) { int base = threadIdx.x; int bid = blockIdx.x; __shared__ real dfMax_s[blockSize]; @@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) { softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); } -void hl_sequence_softmax_forward(real *A_d, - real *C_d, +void hl_sequence_softmax_forward(real* A_d, + real* C_d, const int* index, int numSequence) { CHECK_NOTNULL(A_d); @@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d, dim3 block(512, 1); dim3 grid(numSequence, 1); - KeSequenceSoftMax<512> - <<>>(C_d, A_d, index); + KeSequenceSoftMax<512><<>>(C_d, A_d, index); CHECK_SYNC("hl_sequence_softmax_forward failed"); } -__global__ void KeMatrixDerivative(real *grad_d, - real *output_d, - real *sftmaxSum_d, - int dimM, - int dimN) { - int rowIdx = blockIdx.x*blockDim.x + threadIdx.x; - int colIdx = blockIdx.y*blockDim.y + threadIdx.y; +__global__ void KeMatrixDerivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int colIdx = blockIdx.y * blockDim.y + threadIdx.y; int index; if (rowIdx < dimM && colIdx < dimN) { - index = rowIdx*dimN + colIdx; + index = rowIdx * dimN + colIdx; grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]); } } -void hl_matrix_softmax_derivative(real *grad_d, - real *output_d, - real *sftmaxSum_d, - int dimM, - int dimN) { +void hl_matrix_softmax_derivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { CHECK_NOTNULL(grad_d); CHECK_NOTNULL(output_d); CHECK_NOTNULL(sftmaxSum_d); int blocksX = (dimM + 0) / 1; - int blocksY = (dimN + 1024 -1) / 1024; + int blocksY = (dimN + 1024 - 1) / 1024; dim3 threads(1, 1024); dim3 grid(blocksX, blocksY); - KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>> - (grad_d, output_d, sftmaxSum_d, dimM, dimN); + KeMatrixDerivative<<>>( + grad_d, output_d, sftmaxSum_d, dimM, dimN); CHECK_SYNC("hl_matrix_softmax_derivative failed"); } -__global__ void KeMatrixMultiBinaryCrossEntropy(real* output, - real* entropy, - int* row, - int* col, - int dimM, - int dimN) { +__global__ void KeMatrixMultiBinaryCrossEntropy( + real* output, real* entropy, int* row, int* col, int dimM, int dimN) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < dimM) { - for (int i = 0; i < dimN; i ++) { + for (int i = 0; i < dimN; i++) { entropy[index] -= log(1 - output[index * dimN + i]); } - int 
*row_col = col + row[index]; + int* row_col = col + row[index]; int col_num = row[index + 1] - row[index]; - for (int i = 0; i < col_num; i ++) { + for (int i = 0; i < col_num; i++) { real o = output[index * dimN + row_col[i]]; entropy[index] -= log(o / (1 - o)); } @@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output, dim3 threads(n_threads); dim3 grid(blocks); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); - KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> - (output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); + KeMatrixMultiBinaryCrossEntropy<<>>( + output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed"); } -__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output, - real* grad, - int* row, - int* col, - int dimM, - int dimN) { +__global__ void KeMatrixMultiBinaryCrossEntropyBp( + real* output, real* grad, int* row, int* col, int dimM, int dimN) { int row_idx = blockIdx.x * blockDim.x + threadIdx.x; if (row_idx < dimM) { - for (int i = 0; i < dimN; i ++) { + for (int i = 0; i < dimN; i++) { int index = row_idx * dimN + i; grad[index] += 1.0 / (1 - output[index]); } int col_num = row[row_idx + 1] - row[row_idx]; - int *row_col = col + row[row_idx]; - for (int i = 0; i < col_num; i ++) { + int* row_col = col + row[row_idx]; + for (int i = 0; i < col_num; i++) { int index = row_idx * dimN + row_col[i]; grad[index] -= 1.0 / (output[index] * (1 - output[index])); } } } -void hl_matrix_multi_binary_cross_entropy_bp(real* output, - real* grad, - hl_sparse_matrix_s csr_mat, - int dimM, - int dimN) { +void hl_matrix_multi_binary_cross_entropy_bp( + real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) { CHECK_NOTNULL(output); CHECK_NOTNULL(grad); CHECK_NOTNULL(csr_mat); @@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output, dim3 threads(n_threads); dim3 grid(blocks); hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); - KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> - (output, grad, mat->csr_row, mat->csr_col, dimM, dimN); + KeMatrixMultiBinaryCrossEntropyBp<<>>( + output, grad, mat->csr_row, mat->csr_col, dimM, dimN); CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed"); } -__global__ void KeMatrixCrossEntropy(real* O, - real* E, - int* label, - int dimM, - int dimN) { +__global__ void KeMatrixCrossEntropy( + real* O, real* E, int* label, int dimM, int dimN) { int index = blockIdx.x * blockDim.x + threadIdx.x; int newBase; if (index < dimM) { @@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O, } } -void hl_matrix_cross_entropy(real* A_d, - real* C_d, - int* label_d, - int dimM, - int dimN) { +void hl_matrix_cross_entropy( + real* A_d, real* C_d, int* label_d, int dimM, int dimN) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(C_d); int blocks = (dimM + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); - KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> - (A_d, C_d, label_d, dimM, dimN); + KeMatrixCrossEntropy<<>>( + A_d, C_d, label_d, dimM, dimN); CHECK_SYNC("hl_matrix_cross_entropy failed"); } -__global__ void KeMatrixCrossEntropyBp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN) { - int rowIdx = blockIdx.x*blockDim.x + threadIdx.x; - int colIdx = blockIdx.y*blockDim.y + threadIdx.y; +__global__ void KeMatrixCrossEntropyBp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + 
threadIdx.x; + int colIdx = blockIdx.y * blockDim.y + threadIdx.y; int index; if (rowIdx < dimM && colIdx < dimN) { - index = rowIdx*dimN + colIdx; + index = rowIdx * dimN + colIdx; if (label_d[rowIdx] == colIdx) { grad_d[index] -= 1.0f / output_d[index]; } } } -void hl_matrix_cross_entropy_bp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN) { +void hl_matrix_cross_entropy_bp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { CHECK_NOTNULL(grad_d); CHECK_NOTNULL(output_d); CHECK_NOTNULL(label_d); - int blocksX = (dimM + 0)/1; - int blocksY = (dimN + 1024 -1) / 1024; + int blocksX = (dimM + 0) / 1; + int blocksY = (dimN + 1024 - 1) / 1024; dim3 threads(1, 1024); dim3 grid(blocksX, blocksY); - KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> - (grad_d, output_d, label_d, dimM, dimN); + KeMatrixCrossEntropyBp<<>>( + grad_d, output_d, label_d, dimM, dimN); CHECK_SYNC("hl_matrix_cross_entropy_bp failed"); } void hl_matrix_zero_mem(real* data, int num) { - hl_gpu_apply_unary_op( - unary::Zero(), data, 1, num, num); + hl_gpu_apply_unary_op(unary::Zero(), data, 1, num, num); } __global__ void KeParamReluForward(real* output, @@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output, int ty = blockIdx.y * blockDim.y + threadIdx.y; if (tx < width && ty < height) { int index = ty * width + tx; - output[index] = input[index] > 0 ? input[index] : - input[index] * w[tx / partial_sum]; + output[index] = + input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum]; } } @@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output, CHECK_NOTNULL(w); dim3 threads(16, 16); int blockX = (width + 16 - 1) / 16; - int blockY = (height + 16 -1) / 16; + int blockY = (height + 16 - 1) / 16; dim3 grid(blockX, blockY); - KeParamReluForward<<>> - (output, input, w, width, height, partial_sum); + KeParamReluForward<<>>( + output, input, w, width, height, partial_sum); CHECK_SYNC("hl_param_relu_forward failed"); } -template +template __global__ void KeParamReluBackWardW(real* grad_w, real* grad_o, real* input, @@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w, int grid_num = width / partial_sum; dim3 threads(blockSize, 1); dim3 grid(grid_num, 1); - KeParamReluBackWardW<<>> - (grad_w, grad_o, input, width, height, partial_sum); + KeParamReluBackWardW<<>>( + grad_w, grad_o, input, width, height, partial_sum); CHECK_SYNC("hl_param_relu_backward_w failed"); } @@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o, CHECK_NOTNULL(diff); dim3 threads(16, 16); int blockX = (width + 16 - 1) / 16; - int blockY = (height + 16 -1) / 16; + int blockY = (height + 16 - 1) / 16; dim3 grid(blockX, blockY); - KeParamReluBackwardDiff<<>> - (grad_o, data, w, diff, width, height, partial_sum); + KeParamReluBackwardDiff<<>>( + grad_o, data, w, diff, width, height, partial_sum); CHECK_SYNC("hl_param_relu_backward_diff failed"); } -__global__ void KeMatrixAddSharedBias(real* A, - real* B, - const int channel, - const int M, - const int N, - real scale) { +__global__ void KeMatrixAddSharedBias( + real* A, real* B, const int channel, const int M, const int N, real scale) { int index = blockIdx.x * blockDim.x + threadIdx.x; int dim = N / channel; if (index < M * N) { @@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d, real scale) { const int blocks = 512; const int grids = DIVUP(dimM * dimN, blocks); - KeMatrixAddSharedBias<<>> - (A_d, B_d, channel, dimM, dimN, scale); + KeMatrixAddSharedBias<<>>( + A_d, B_d, channel, dimM, dimN, 
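// Recap of the parametric-ReLU kernels in this hunk: every `partial_sum`
// consecutive columns share one learnable slope, selected as w[tx / partial_sum]
// in KeParamReluForward above. Writing k = column / partial_sum, the forward
// rule shown above is
//   y = (x > 0) ? x : w[k] * x
// and the gradients the backward kernels accumulate follow the usual PReLU
// derivation (stated here for context, not quoted from the kernel bodies):
//   dL/dx    = (x > 0) ? dy : w[k] * dy
//   dL/dw[k] = sum over the group of ((x > 0) ? 0 : x * dy)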
scale); CHECK_SYNC("hl_matrix_add_shared_bias failed"); } - template -__global__ void KeMatrixCollectSharedBias(real *B, - real *A, +__global__ void KeMatrixCollectSharedBias(real* B, + real* A, const int channel, const int M, const int N, @@ -589,7 +541,7 @@ __global__ void KeMatrixCollectSharedBias(real *B, int n = j * blockSize + tid; int m = n / dim; int w = n % dim; - smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; + smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; __syncthreads(); simpleReduce(smem, tid, blockSize); sum += smem[0]; @@ -611,33 +563,32 @@ void hl_matrix_collect_shared_bias(real* B_d, const int limit = 64; int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel; - KeMatrixCollectSharedBias - <<< grids, blocks, 0, STREAM_DEFAULT>>> - (B_d, A_d, channel, dimM, dimN, dim, limit, scale); + KeMatrixCollectSharedBias<<>>( + B_d, A_d, channel, dimM, dimN, dim, limit, scale); CHECK_SYNC("hl_matrix_collect_shared_bias failed"); } -__global__ void keMatrixRotate(real* mat, real* matRot, - int dimM, int dimN, bool clockWise) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < dimM * dimN) { - int i = idx / dimN; - int j = idx % dimN; - if (clockWise) { - matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; - } else { - matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; - } +__global__ void keMatrixRotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < dimM * dimN) { + int i = idx / dimN; + int j = idx % dimN; + if (clockWise) { + matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; + } else { + matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; } + } } -void hl_matrix_rotate(real *mat, real* matRot, - int dimM, int dimN, bool clockWise) { - CHECK_NOTNULL(mat); - CHECK_NOTNULL(matRot); - const int threads = 512; - const int blocks = DIVUP(dimM * dimN, threads); - keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>> - (mat, matRot, dimM, dimN, clockWise); - CHECK_SYNC("hl_matrix_rotate failed"); +void hl_matrix_rotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) { + CHECK_NOTNULL(mat); + CHECK_NOTNULL(matRot); + const int threads = 512; + const int blocks = DIVUP(dimM * dimN, threads); + keMatrixRotate<<>>( + mat, matRot, dimM, dimN, clockWise); + CHECK_SYNC("hl_matrix_rotate failed"); } diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 0fe2877f89..c52780dfca 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -16,36 +16,36 @@ limitations under the License. 
*/ #include "hl_device_functions.cuh" #include "paddle/utils/Logging.h" -__global__ void KeMaxSequenceForward(real *input, - const int *sequence, +__global__ void KeMaxSequenceForward(real* input, + const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim) { int dimIdx = threadIdx.x; int sequenceId = blockIdx.x; if (sequenceId >= numSequences) return; int start = sequence[sequenceId]; - int end = sequence[sequenceId+1]; + int end = sequence[sequenceId + 1]; for (int i = dimIdx; i < dim; i += blockDim.x) { real tmp = -HL_FLOAT_MAX; int tmpId = -1; for (int insId = start; insId < end; insId++) { - if (tmp < input[insId*dim + i]) { - tmp = input[insId*dim + i]; + if (tmp < input[insId * dim + i]) { + tmp = input[insId * dim + i]; tmpId = insId; } } - output[sequenceId*dim + i] = tmp; - index[sequenceId*dim + i] = tmpId; + output[sequenceId * dim + i] = tmp; + index[sequenceId * dim + i] = tmpId; } } void hl_max_sequence_forward(real* input, const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim) { CHECK_NOTNULL(input); @@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input, dim3 threads(256, 1); dim3 grid(numSequences, 1); - KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, output, index, numSequences, dim); + KeMaxSequenceForward<<>>( + input, sequence, output, index, numSequences, dim); CHECK_SYNC("hl_max_sequence_forward failed"); } -__global__ void KeMaxSequenceBackward(real *outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim) { +__global__ void KeMaxSequenceBackward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { int idx = threadIdx.x + blockIdx.x * blockDim.x; int colIdx = idx % dim; - if (idx < numSequences*dim) { + if (idx < numSequences * dim) { int insId = index[idx]; inputGrad[insId * dim + colIdx] += outputGrad[idx]; } } -void hl_max_sequence_backward(real* outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim) { +void hl_max_sequence_backward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { CHECK_NOTNULL(outputGrad); CHECK_NOTNULL(index); CHECK_NOTNULL(inputGrad); @@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad, unsigned int blocks = (numSequences * dim + 128 - 1) / 128; dim3 threads(128, 1); dim3 grid(blocks, 1); - KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>> - (outputGrad, index, inputGrad, numSequences, dim); + KeMaxSequenceBackward<<>>( + outputGrad, index, inputGrad, numSequences, dim); CHECK_SYNC("hl_max_sequence_backward failed"); } -template +template __global__ void KeMatrixAddRows(real* output, real* table, int* ids, @@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output, while (sampleId < numSamples) { int tableId = ids[sampleId]; if ((0 <= tableId) && (tableId < tableSize)) { - real *outputData = output + sampleId * dim; - real *tableData = table + tableId * dim; + real* outputData = output + sampleId * dim; + real* tableData = table + tableId * dim; for (int i = idx; i < dim; i += blockDimX) { if (AddRow == 0) { outputData[i] += tableData[i]; @@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output, } } } - sampleId += blockDimY*gridDimX; + sampleId += blockDimY * gridDimX; } } -template -__global__ -void KeSequence2Batch(real *batch, - real *sequence, - const int *batchIndex, - int seqWidth, - int batchCount) { +template +__global__ void KeSequence2Batch(real* batch, + real* sequence, + const int* 
batchIndex, + int seqWidth, + int batchCount) { int idx = threadIdx.x; int idy = threadIdx.y; int id = blockIdx.x + idy * gridDimX; while (id < batchCount) { int seqId = batchIndex[id]; - real* batchData = batch + id*seqWidth; - real* seqData = sequence + seqId*seqWidth; + real* batchData = batch + id * seqWidth; + real* seqData = sequence + seqId * seqWidth; for (int i = idx; i < seqWidth; i += blockDimX) { if (seq2batch) { if (isAdd) { @@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch, } } } - id += blockDimY*gridDimX; + id += blockDimY * gridDimX; } } -void hl_sequence2batch_copy(real *batch, - real *sequence, - const int *batchIndex, +void hl_sequence2batch_copy(real* batch, + real* sequence, + const int* batchIndex, int seqWidth, int batchCount, bool seq2batch) { @@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch, dim3 threads(128, 8); dim3 grid(8, 1); if (seq2batch) { - KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 1, 0><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } else { - KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 0, 0><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } CHECK_SYNC("hl_sequence2batch_copy failed"); } -void hl_sequence2batch_add(real *batch, - real *sequence, - int *batchIndex, +void hl_sequence2batch_add(real* batch, + real* sequence, + int* batchIndex, int seqWidth, int batchCount, bool seq2batch) { @@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch, dim3 threads(128, 8); dim3 grid(8, 1); if (seq2batch) { - KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 1, 1><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } else { - KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (batch, sequence, batchIndex, seqWidth, batchCount); + KeSequence2Batch<128, 8, 8, 0, 1><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); } CHECK_SYNC("hl_sequence2batch_add failed"); } -template -__global__ -void KeSequence2BatchPadding(real* batch, - real* sequence, - const int* sequenceStartPositions, - const size_t sequenceWidth, - const size_t maxSequenceLength, - const size_t numSequences) { +template +__global__ void KeSequence2BatchPadding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences) { int batchIdx = blockIdx.y; int sequenceStart = sequenceStartPositions[batchIdx]; int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; @@ -269,45 +265,56 @@ void hl_sequence2batch_copy_padding(real* batch, int blockDimY = CUDA_BLOCK_SIZE / blockDimX; dim3 threads(blockDimX, blockDimY); - int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) / - CUDA_BLOCK_SIZE; + int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY; int gridDimY = numSequences; dim3 grid(gridDimX, gridDimY); if (seq2batch) { /* sequence -> batch */ if (normByTimes) { - KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<1, 1><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + 
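// On the gridDimX change above: assuming blockDimX divides CUDA_BLOCK_SIZE
// (so that blockDimX * blockDimY == CUDA_BLOCK_SIZE), the old expression
//   (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) / CUDA_BLOCK_SIZE
// and the new one
//   (maxSequenceLength + blockDimY - 1) / blockDimY
// compute the same value, ceil(maxSequenceLength / blockDimY). For example,
// with the hypothetical values CUDA_BLOCK_SIZE = 256, blockDimX = 32 (hence
// blockDimY = 8) and maxSequenceLength = 100:
//   old: (100 * 32 + 255) / 256 = 3455 / 256 = 13
//   new: (100 + 7) / 8 = 107 / 8 = 13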
numSequences); } else { - KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<0, 1><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } } else { /* batch -> sequence */ if (normByTimes) { - KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<1, 0><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } else { - KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( - batch, sequence, sequenceStartPositions, - sequenceWidth, maxSequenceLength, numSequences); + KeSequence2BatchPadding<0, 0><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); } } CHECK_SYNC("hl_sequence2batch_copy_padding failed"); } -__device__ inline float my_rsqrt(float x) { - return rsqrtf(x); -} +__device__ inline float my_rsqrt(float x) { return rsqrtf(x); } -__device__ inline double my_rsqrt(double x) { - return rsqrt(x); -} +__device__ inline double my_rsqrt(double x) { return rsqrt(x); } __global__ void KeSequenceAvgForward(real* dst, real* src, @@ -328,9 +335,9 @@ __global__ void KeSequenceAvgForward(real* dst, for (int i = start; i < end; i++) { sum += src[i * width + col]; } - sum = mode == 1 ? sum : - (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength)); - dst[gid] = sum; + sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength + : sum * my_rsqrt((real)seqLength)); + dst[gid] += sum; } } @@ -348,10 +355,10 @@ void hl_sequence_avg_forward(real* dst, int grid = DIVUP(width * height, 512); CHECK(mode == 0 || mode == 1 || mode == 2) - << "mode error in hl_sequence_avg_forward!"; + << "mode error in hl_sequence_avg_forward!"; - KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>> - (dst, src, starts, height, width, mode); + KeSequenceAvgForward<<>>( + dst, src, starts, height, width, mode); CHECK_SYNC("hl_sequence_avg_forward failed"); } @@ -371,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst, int seqLength = end - start; if (seqLength == 0) return; real grad = src[gid]; - grad = mode == 1 ? grad : - (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength)); + grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength + : grad * my_rsqrt((real)seqLength)); for (int i = start; i < end; i++) { dst[i * width + col] += grad; } @@ -393,9 +400,9 @@ void hl_sequence_avg_backward(real* dst, int grid = DIVUP(width * height, 512); CHECK(mode == 0 || mode == 1 || mode == 2) - << "mode error in hl_sequence_avg_backward!"; + << "mode error in hl_sequence_avg_backward!"; - KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>> - (dst, src, starts, height, width, mode); + KeSequenceAvgBackward<<>>( + dst, src, starts, height, width, mode); CHECK_SYNC("hl_sequence_avg_backward failed"); } diff --git a/paddle/cuda/src/hl_cuda_sparse.cu b/paddle/cuda/src/hl_cuda_sparse.cu index ab9ab57c88..6351e7e01e 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cu +++ b/paddle/cuda/src/hl_cuda_sparse.cu @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "hl_cuda.h" +#include "hl_cuda_sparse.cuh" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" #include "hl_sparse.h" #include "hl_sparse.ph" -#include "hl_matrix_ops.cuh" -#include "hl_matrix_apply.cuh" -#include "hl_cuda_sparse.cuh" #include "paddle/utils/Logging.h" DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p); @@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!"; if (A_d->nnz == 0) { - hl_gpu_apply_unary_op( - unary::Zero(), C_d, dimM, dimN, dimN); + hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); return; } /* nnz != 0 */ hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); - CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && - A_d2->csr_row && A_d2->csr_col) << "parameter transa error!"; + CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row && + A_d2->csr_col) + << "parameter transa error!"; int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; @@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsr2Dense<0> - <<>>(A_d2->csr_val, - A_d2->csr_row, - A_d2->csr_col, - C_d, - dimM, - dimN); + KeSMatrixCsr2Dense<0><<>>( + A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); } else if (A_d->type == HL_FLOAT_VALUE) { - KeSMatrixCsr2Dense<1> - <<>>(A_d2->csr_val, - A_d2->csr_row, - A_d2->csr_col, - C_d, - dimM, - dimN); + KeSMatrixCsr2Dense<1><<>>( + A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); } else { } CHECK_SYNC("hl_matrix_csr2dense failed"); @@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!"; if (A_d->nnz == 0) { - hl_gpu_apply_unary_op( - unary::Zero(), C_d, dimM, dimN, dimN); + hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); return; } /* nnz != 0 */ hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); - CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && - A_d2->csc_row && A_d2->csc_col) << "parameter transa error!"; + CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row && + A_d2->csc_col) + << "parameter transa error!"; int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; @@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsc2Dense<0> - <<>>(A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - C_d, - dimM, - dimN); + KeSMatrixCsc2Dense<0><<>>( + A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); } else if (A_d->type == HL_FLOAT_VALUE) { - KeSMatrixCsc2Dense<1> - <<>>(A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - C_d, - dimM, - dimN); + KeSMatrixCsc2Dense<1><<>>( + A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); } else { } CHECK_SYNC("hl_matrix_csc2dense failed"); @@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { CHECK_NOTNULL(A_d); CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; CHECK(value_type == HL_FLOAT_VALUE || value_type == 
HL_NO_VALUE) - << "sparse matrix value type error!"; + << "sparse matrix value type error!"; /* avoid malloc 0 bytes */ int nnz_s = (nnz == 0 ? 1 : nnz); if (format == HL_SPARSE_CSR) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); - hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); csr->sparsity = -1.0; if (value_type == HL_NO_VALUE) { csr->csr_val = NULL; csr->nnz_s = nnz_s; - csr->row_s = dimM+1; - csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int)); - csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int)); + csr->row_s = dimM + 1; + csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); + csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (value_type == HL_FLOAT_VALUE) { csr->nnz_s = nnz_s; - csr->row_s = dimM+1; - csr->csr_val = (real*)hl_malloc_device((nnz_s)*sizeof(real)); - csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int)); - csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int)); + csr->row_s = dimM + 1; + csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); + csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); + csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; @@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, } else if (format == HL_SPARSE_CSC) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); - hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); csc->sparsity = -1.0f; if (value_type == HL_NO_VALUE) { csc->csc_val = NULL; csc->nnz_s = nnz_s; - csc->col_s = dimN+1; - csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int)); - csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int)); + csc->col_s = dimN + 1; + csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; } else if (value_type == HL_FLOAT_VALUE) { csc->nnz_s = nnz_s; - csc->col_s = dimN+1; - csc->csc_val = (real*)hl_malloc_device((nnz_s)*sizeof(real)); - csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int)); - csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int)); + csc->col_s = dimN + 1; + csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); + csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; @@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { CHECK_NOTNULL(A_d); CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (A_d->matrix == NULL) { free(A_d); @@ -249,77 +228,77 @@ void 
hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { } void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - void * dest_d, + void *dest_d, size_t size, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { CHECK_NOTNULL(A_d); CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (format == HL_SPARSE_CSR) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - size_t size_ = (dimM+1)*sizeof(int) + nnz*sizeof(int); + size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int); if (value_type != HL_NO_VALUE) { - size_ += nnz*sizeof(real); + size_ += nnz * sizeof(real); } CHECK_LE(size_, size) << "dest_d size(" << size - << ") too small, should bigger than(" << size_ << ")!"; + << ") too small, should bigger than(" << size_ + << ")!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); - hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); if (value_type == HL_NO_VALUE) { csr->csr_val = NULL; - csr->csr_row = (int*)dest_d; - csr->csr_col = (int*)((char*)dest_d + (dimM+1)*sizeof(int)); + csr->csr_row = (int *)dest_d; + csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int)); } else { - csr->csr_val = (real*)dest_d; - csr->csr_row = (int*)((char*)dest_d + nnz*sizeof(real)); - csr->csr_col = (int*)((char*)dest_d + - nnz*sizeof(real) + - (dimM+1)*sizeof(int)); + csr->csr_val = (real *)dest_d; + csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real)); + csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) + + (dimM + 1) * sizeof(int)); } csr->nnz_s = nnz; - csr->row_s = dimM+1; + csr->row_s = dimM + 1; csr->sparsity = -1.0; *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (format == HL_SPARSE_CSC) { CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - size_t size_ = (dimN+1)*sizeof(int) + nnz*sizeof(int); + size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int); if (value_type != HL_NO_VALUE) { - size_ += nnz*sizeof(real); + size_ += nnz * sizeof(real); } CHECK_LE(size_, size) << "dest_d size(" << size - << ") too small, should bigger than(" << size_ << ")!"; + << ") too small, should bigger than(" << size_ + << ")!"; - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); - hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s)); + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); if (value_type == HL_NO_VALUE) { csc->csc_val = NULL; - csc->csc_col = (int*)dest_d; - csc->csc_row = (int*)((char*)dest_d + (dimN+1)*sizeof(int)); + csc->csc_col = (int *)dest_d; + csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int)); } else { - csc->csc_val = (real*)dest_d; - csc->csc_col = (int*)((char*)dest_d + nnz*sizeof(real)); - csc->csc_row = (int*)((char*)dest_d + - nnz*sizeof(real) + - (dimN+1)*sizeof(int)); + csc->csc_val = (real *)dest_d; + csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real)); + csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) + + (dimN + 1) * sizeof(int)); } csc->nnz_s = nnz; - csc->col_s = dimN+1; + csc->col_s = dimN + 1; csc->sparsity = -1.0f; *A_d = 
(hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csc; @@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, } void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - real* value_d, - int* rows_d, - int* cols_d, + real *value_d, + int *rows_d, + int *cols_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) { @@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; + << "sparse matrix format error!"; if (format == HL_SPARSE_CSR) { - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csr_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); CHECK_NOTNULL(tmp); hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); @@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, *A_d = (hl_sparse_matrix_s)tmp; (*A_d)->matrix = (hl_matrix_s)csr; } else if (format == HL_SPARSE_CSC) { - char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s) - + sizeof(_hl_csc_matrix)); + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); CHECK_NOTNULL(tmp); hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); @@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, hl_stream_t stream) { CHECK_NOTNULL(csr_matrix); CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) - << "csr_matrix is not csr format!"; + << "csr_matrix is not csr format!"; CHECK_NOTNULL(csr_matrix->matrix); hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); - CHECK_LE(csr_matrix->nnz, csr->nnz_s) - << "copy size " << csr_matrix->nnz - << " is big than alloc size " << csr->nnz_s; + CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz + << " is big than alloc size " + << csr->nnz_s; - CHECK_LE((csr_matrix->rows+1), csr->row_s) - << "copy size " << (csr_matrix->rows + 1) - << " is big than alloc size " << csr->row_s; + CHECK_LE((csr_matrix->rows + 1), csr->row_s) + << "copy size " << (csr_matrix->rows + 1) << " is big than alloc size " + << csr->row_s; - CHECK(csr_matrix->type == HL_FLOAT_VALUE || - csr_matrix->type == HL_NO_VALUE) - << "sparse matrix value type error!"; + CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE) + << "sparse matrix value type error!"; if (csr_matrix->type == HL_NO_VALUE) { if (csr_row == NULL && csr_col == NULL) { return; } else if (csr_row != NULL && csr_col != NULL) { - hl_memcpy_async(csr->csr_row, - csr_row, - (csr_matrix->rows+1)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); - hl_memcpy_async(csr->csr_col, - csr_col, - (csr_matrix->nnz)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; } @@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, if (csr_val == NULL && csr_row == NULL && csr_col == NULL) { return; } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) { - hl_memcpy_async(csr->csr_val, - csr_val, - (csr_matrix->nnz)*sizeof(real), - stream); + hl_memcpy_async( + csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) { - 
hl_memcpy_async(csr->csr_val, - csr_val, - (csr_matrix->nnz)*sizeof(real), - stream); - hl_memcpy_async(csr->csr_row, - csr_row, - (csr_matrix->rows+1)*sizeof(int), - stream); - hl_memcpy_async(csr->csr_col, - csr_col, - (csr_matrix->nnz)*sizeof(int), - stream); + hl_memcpy_async( + csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); + hl_memcpy_async( + csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); + hl_memcpy_async( + csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; } } - csr->sparsity = ((float)csr_matrix->nnz) / - ((float)csr_matrix->rows) / + csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) / ((float)csr_matrix->cols); } @@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, hl_stream_t stream) { CHECK_NOTNULL(csc_matrix); CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) - << "csc_matrix is not csc format error!"; + << "csc_matrix is not csc format error!"; hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); - CHECK_LE(csc_matrix->nnz, csc->nnz_s) - << "copy size " << csc_matrix->nnz - << " is big than alloc size " << csc->nnz_s; + CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz + << " is big than alloc size " + << csc->nnz_s; - CHECK_LE((csc_matrix->cols+1), csc->col_s) - << "copy size " <<(csc_matrix->cols + 1) - << " is big than alloc size " << csc->col_s; + CHECK_LE((csc_matrix->cols + 1), csc->col_s) + << "copy size " << (csc_matrix->cols + 1) << " is big than alloc size " + << csc->col_s; - CHECK(csc_matrix->type == HL_FLOAT_VALUE || - csc_matrix->type == HL_NO_VALUE) - << "sparse matrix value type error!"; + CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE) + << "sparse matrix value type error!"; if (csc_matrix->type == HL_NO_VALUE) { if (csc_row == NULL && csc_col == NULL) { return; } else if (csc_row != NULL && csc_col != NULL) { - hl_memcpy_async(csc->csc_row, - csc_row, - (csc_matrix->nnz)*sizeof(int), - stream); - hl_memcpy_async(csc->csc_col, - csc_col, - (csc_matrix->cols+1)*sizeof(int), - stream); + hl_memcpy_async( + csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); + hl_memcpy_async( + csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; } @@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, if (csc_val == NULL && csc_row == NULL && csc_col == NULL) { return; } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) { - hl_memcpy_async(csc->csc_val, - csc_val, - (csc_matrix->nnz)*sizeof(real), - stream); + hl_memcpy_async( + csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) { - hl_memcpy_async(csc->csc_val, - csc_val, - (csc_matrix->nnz)*sizeof(real), - stream); - hl_memcpy_async(csc->csc_row, - csc_row, - (csc_matrix->nnz)*sizeof(int), - stream); - hl_memcpy_async(csc->csc_col, - csc_col, - (csc_matrix->cols+1)*sizeof(int), - stream); + hl_memcpy_async( + csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); + hl_memcpy_async( + csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); + hl_memcpy_async( + csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); } else { LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; } } - csc->sparsity = ((float)csc_matrix->nnz) / - 
((float)csc_matrix->rows) / + csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) / ((float)csc_matrix->cols); } @@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst, hl_sparse_matrix_s src, hl_stream_t stream) { CHECK(dst && src && dst->matrix && src->matrix) - << "parameter dst or src is null pointer!"; - CHECK_EQ(dst->format, src->format) - << "sparse matrix format does not match!"; + << "parameter dst or src is null pointer!"; + CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!"; CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE) - << "src sparse matrix is no value, dst sparse matrix has value!"; + << "src sparse matrix is no value, dst sparse matrix has value!"; if (dst->format == HL_SPARSE_CSR) { dst->rows = src->rows; dst->cols = src->cols; - dst->nnz = src->nnz; + dst->nnz = src->nnz; hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - hl_memcpy_csr_matrix(dst, - csr->csr_val, - csr->csr_row, - csr->csr_col, - stream); + hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream); } else if (dst->format == HL_SPARSE_CSC) { dst->rows = src->rows; dst->cols = src->cols; - dst->nnz = src->nnz; + dst->nnz = src->nnz; hl_csc_matrix csc = (hl_csc_matrix)src->matrix; - hl_memcpy_csc_matrix(dst, - csc->csc_val, - csc->csc_row, - csc->csc_col, - stream); + hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream); } else { LOG(FATAL) << "sparse matrix format error!"; } @@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) { if (beta == 0.0) { hl_gpu_apply_unary_op(unary::Zero(), c, dimM, dimN, dimN); } else { - if (beta != 1.0){ - hl_gpu_apply_unary_op( - unary::mul_scalar(beta), c, dimM, dimN, dimN); + if (beta != 1.0) { + hl_gpu_apply_unary_op(unary::mul_scalar(beta), c, dimM, dimN, dimN); } } return; } -void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transb, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) || (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) { - LOG(FATAL) << "parameter error!"; + LOG(FATAL) << "parameter error!"; } if (A_d->nnz == 0) { @@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csr_row == NULL || - A_d2->csr_col == NULL) { + A_d2->csr_row == NULL || A_d2->csr_col == NULL) { LOG(FATAL) << "parameter error!"; } @@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* sparsity pattern */ // A_d->sparsity; if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsrMulDense<0> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<0><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCsrMulDense<1> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - 
dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<1><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (HPPL_OP_T == transa) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / - CU_CSC_MUL_DENSE_BLOCK_N; - int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / - CU_CSC_MUL_DENSE_BLOCK_K; + int blocksX = + (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; + int blocksY = + (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCscMulDense<0> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<0><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCscMulDense<1> - <<>>(C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<1><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transa error!"; @@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_csr_mul_dense failed"); } -void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, - hl_sparse_matrix_s B_d, hl_trans_op_t transb, +void hl_matrix_dense_mul_csc(real *A_d, + hl_trans_op_t transa, + hl_sparse_matrix_s B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transa, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -698,8 +648,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, LOG(FATAL) << "parameter dims error!"; } - CHECK_EQ(B_d->format, HL_SPARSE_CSC) - << "matrix format error!"; + CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!"; if (B_d->nnz == 0) { _beta_mul_c(C_d, dimM, dimN, beta); @@ -709,8 +658,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix); if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csc_row == NULL || - B_d2->csc_col == NULL) { + B_d2->csc_row == NULL || B_d2->csc_col == NULL) { LOG(FATAL) << "parameter B is null!"; } @@ -721,60 +669,60 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsc<0> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_row, - B_d2->csc_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<0><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_row, + B_d2->csc_col, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsc<1> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_row, - B_d2->csc_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<1><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_row, + B_d2->csc_col, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (transb == HPPL_OP_T) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X; - int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M; + int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; + int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; dim3 
threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsr<0> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_col, - B_d2->csc_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<0><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_col, + B_d2->csc_row, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsr<1> - <<>>(C_d, - A_d, - B_d2->csc_val, - B_d2->csc_col, - B_d2->csc_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<1><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_col, + B_d2->csc_row, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transb error!"; @@ -783,24 +731,28 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_dense_mul_csc failed"); } -void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, - hl_sparse_matrix_s B_d, hl_trans_op_t transb, +void hl_matrix_dense_mul_csr(real *A_d, + hl_trans_op_t transa, + hl_sparse_matrix_s B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transa, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); - if (dimM <= 0 || dimN <= 0 || dimK <= 0 - || (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) - || (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { + if (dimM <= 0 || dimN <= 0 || dimK <= 0 || + (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) || + (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { LOG(FATAL) << "parameter dims error!"; } - CHECK_EQ(B_d->format, HL_SPARSE_CSR) - << "matrix format error!"; + CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!"; if (B_d->nnz == 0) { _beta_mul_c(C_d, dimM, dimN, beta); @@ -810,41 +762,40 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix); if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csr_row == NULL || - B_d2->csr_col == NULL) { + B_d2->csr_row == NULL || B_d2->csr_col == NULL) { LOG(FATAL) << "parameter transa error!"; } if (transb == HPPL_OP_N) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X; - int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M; + int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; + int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsr<0> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_row, - B_d2->csr_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<0><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_row, + B_d2->csr_col, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsr<1> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_row, - B_d2->csr_col, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsr<1><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_row, + B_d2->csr_col, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (transb == HPPL_OP_T) { int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST; @@ -852,29 +803,29 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST); dim3 grid(blocksX, blocksY); if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsc<0> - 
<<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_col, - B_d2->csr_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<0><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_col, + B_d2->csr_row, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixDenseMulCsc<1> - <<>>(C_d, - A_d, - B_d2->csr_val, - B_d2->csr_col, - B_d2->csr_row, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulCsc<1><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_col, + B_d2->csr_row, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transb error!"; @@ -883,11 +834,16 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_dense_mul_csr failed"); } -void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_EQ(transb, HPPL_OP_N); CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -908,42 +864,43 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* nnz != 0 */ hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csc_row == NULL || - A_d2->csc_col == NULL) { + A_d2->csc_row == NULL || A_d2->csc_col == NULL) { LOG(FATAL) << "parameter error!"; } if (HPPL_OP_N == transa) { _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N -1)/CU_CSC_MUL_DENSE_BLOCK_N; - int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K -1)/CU_CSC_MUL_DENSE_BLOCK_K; + int blocksX = + (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; + int blocksY = + (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); dim3 grid(blocksX, blocksY); if (A_d->type == HL_NO_VALUE) { - KeSMatrixCscMulDense<0> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<0><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCscMulDense<1> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCscMulDense<1><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else if (HPPL_OP_T == transa) { int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N; @@ -954,29 +911,29 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, /* sparsity pattern */ // A_d->sparsity; if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsrMulDense<0> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<0><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } else { - KeSMatrixCsrMulDense<1> - <<>>(C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixCsrMulDense<1><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); } } else { LOG(FATAL) << "parameter transa error!"; @@ -985,11 +942,16 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, 
hl_trans_op_t transa, CHECK_SYNC("hl_matrix_csc_mul_dense failed"); } -void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, - hl_sparse_matrix_s C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { +void hl_sparse_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, + hl_sparse_matrix_s C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); @@ -1000,18 +962,14 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, if (C_d->format == HL_SPARSE_CSC) { hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix); - if (C_d2->csc_val == NULL || - C_d2->csc_row == NULL || + if (C_d2->csc_val == NULL || C_d2->csc_row == NULL || C_d2->csc_col == NULL) { LOG(FATAL) << "parameter error!"; } if (beta != 1.0) { - hl_gpu_apply_unary_op(unary::mul_scalar(beta), - C_d2->csc_val, - 1, - C_d->nnz, - C_d->nnz); + hl_gpu_apply_unary_op( + unary::mul_scalar(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz); } int blocksX = dimN; @@ -1020,34 +978,30 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, dim3 grid(blocksX, blocksY); bool transA = transa == HPPL_OP_T ? 1 : 0; bool transB = transb == HPPL_OP_T ? 1 : 0; - KeSMatrixDenseMulDense2CSC - <<>>(C_d2->csc_val, - C_d2->csc_row, - C_d2->csc_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); + KeSMatrixDenseMulDense2CSC<<>>( + C_d2->csc_val, + C_d2->csc_row, + C_d2->csc_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); CHECK_SYNC("hl_sparse_matrix_mul failed"); } else { hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix); if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) || - C_d2->csr_row == NULL || - C_d2->csr_col == NULL) { + C_d2->csr_row == NULL || C_d2->csr_col == NULL) { LOG(FATAL) << "parameter error!"; } if (beta != 1.0) { - hl_gpu_apply_unary_op(unary::mul_scalar(beta), - C_d2->csr_val, - 1, - C_d->nnz, - C_d->nnz); + hl_gpu_apply_unary_op( + unary::mul_scalar(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz); } bool transA = transa == HPPL_OP_T ? 1 : 0; @@ -1058,20 +1012,20 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1); dim3 grid(blocksX, blocksY); - KeSMatrixDenseMulDense2CSR - <<>>(C_d2->csr_val, - C_d2->csr_row, - C_d2->csr_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); + KeSMatrixDenseMulDense2CSR<<>>( + C_d2->csr_val, + C_d2->csr_row, + C_d2->csr_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); } else { CHECK(!transA) << "Not supported A is trans and B is not trans!"; @@ -1080,21 +1034,21 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa, avgNnzPerRow = avgNnzPerRow > 0 ? 
avgNnzPerRow : 1; int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE); dim3 grid(gridx, dimM); - KeSMatrixDenseMulDenseTrans2CSR - <<>>(C_d2->csr_val, - C_d2->csr_row, - C_d2->csr_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); - } + KeSMatrixDenseMulDenseTrans2CSR<<>>( + C_d2->csr_val, + C_d2->csr_row, + C_d2->csr_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); + } } } @@ -1111,7 +1065,7 @@ void hl_memcpy_from_csc_matrix(real *csc_val, CHECK_NOTNULL(csc_col); CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) - << "csc_matrix is not csc format error!"; + << "csc_matrix is not csc format error!"; if (csc_matrix->nnz > row_size || csc_matrix->cols + 1 > static_cast(col_size)) { @@ -1119,20 +1073,20 @@ void hl_memcpy_from_csc_matrix(real *csc_val, } hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); - hl_memcpy_async((void*)csc_row, - (void*)csc->csc_row, + hl_memcpy_async((void *)csc_row, + (void *)csc->csc_row, (csc_matrix->nnz) * sizeof(int), stream); - hl_memcpy_async((void*)csc_col, - (void*)csc->csc_col, + hl_memcpy_async((void *)csc_col, + (void *)csc->csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); if (csc_matrix->type == HL_FLOAT_VALUE) { if (csc_val != NULL) { CHECK_LE(csc_matrix->nnz, val_size) << "size not match!"; - hl_memcpy_async((void*)csc_val, - (void*)csc->csc_val, - (csc_matrix->nnz)*sizeof(real), + hl_memcpy_async((void *)csc_val, + (void *)csc->csc_val, + (csc_matrix->nnz) * sizeof(real), stream); } else { LOG(FATAL) << "parameter csr_val is null pointer!"; @@ -1152,7 +1106,7 @@ void hl_memcpy_from_csr_matrix(real *csr_val, CHECK_NOTNULL(csr_row); CHECK_NOTNULL(csr_col); CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) - << "csr_matrix is not csr format error!"; + << "csr_matrix is not csr format error!"; if (csr_matrix->nnz > col_size || csr_matrix->rows + 1 > static_cast(row_size)) { @@ -1160,20 +1114,20 @@ void hl_memcpy_from_csr_matrix(real *csr_val, } hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); - hl_memcpy_async((void*)csr_row, - (void*)csr->csr_row, - (csr_matrix->rows+1)*sizeof(int), + hl_memcpy_async((void *)csr_row, + (void *)csr->csr_row, + (csr_matrix->rows + 1) * sizeof(int), stream); - hl_memcpy_async((void*)csr_col, - (void*)csr->csr_col, - (csr_matrix->nnz)*sizeof(int), + hl_memcpy_async((void *)csr_col, + (void *)csr->csr_col, + (csr_matrix->nnz) * sizeof(int), stream); if (csr_matrix->type == HL_FLOAT_VALUE) { if (csr_val != NULL) { CHECK_LE(csr_matrix->nnz, val_size) << "size not match!"; - hl_memcpy_async((void*)csr_val, - (void*)csr->csr_val, - (csr_matrix->nnz)*sizeof(real), + hl_memcpy_async((void *)csr_val, + (void *)csr->csr_val, + (csr_matrix->nnz) * sizeof(real), stream); } else { LOG(FATAL) << "parameter csr_val is null pointer!"; @@ -1181,8 +1135,8 @@ void hl_memcpy_from_csr_matrix(real *csr_val, } } -void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM, - int dimN, real scale) { +void hl_sparse_matrix_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) { if (B_d->format == HL_SPARSE_CSR) { hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale); } else { @@ -1190,8 +1144,8 @@ void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM, } } -void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d, - int dimM, int dimN, real scale) { +void hl_matrix_csr_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int 
dimM, int dimN, real scale) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d, CHECK_SYNC("hl_matrix_csr_column_sum failed"); } -void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, - real* B_d, real scale) { +void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { if (A_d->format == HL_SPARSE_CSR) { hl_matrix_csr_add_bias(A_d, B_d, scale); } else { @@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, } } -void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, - real scale) { +void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, CHECK_SYNC("hl_sparse_matrix_add_bias failed"); } -void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM, - int dimN, real alpha, real beta) { +void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, + real *B_d, + int dimM, + int dimN, + real alpha, + real beta) { if (A_d->format == HL_SPARSE_CSR) { hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta); } else { @@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM, } } -void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM, - int dimN, real alpha, real beta) { +void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, + real *B_d, + int dimM, + int dimN, + real alpha, + real beta) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); @@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM, gridX = gridX > 0 ? gridX : 1; dim3 block(512, 1); dim3 grid(gridX, dimM); - KeSMatrixCsrAddDense<<>>( - A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, B_d, alpha, beta, dimM, dimN); + KeSMatrixCsrAddDense<<>>(A_d2->csr_val, + A_d2->csr_row, + A_d2->csr_col, + B_d, + alpha, + beta, + dimM, + dimN); CHECK_SYNC("hl_sparse_matrix_add_dense failed"); } -int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { +int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, row); } -int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { +int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, col); } -real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { +real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { __sparse_get_return__(sMat, val); } diff --git a/paddle/cuda/src/hl_perturbation_util.cu b/paddle/cuda/src/hl_perturbation_util.cu index 2a945bcdb8..d01a91561e 100644 --- a/paddle/cuda/src/hl_perturbation_util.cu +++ b/paddle/cuda/src/hl_perturbation_util.cu @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#include #include -#include "hl_cuda.h" -#include "hl_time.h" +#include #include "hl_base.h" +#include "hl_cuda.h" #include "hl_perturbation_util.cuh" +#include "hl_time.h" #define _USE_MATH_DEFINES @@ -30,10 +29,16 @@ limitations under the License. */ * centerX, centerY: translation. * sourceX, sourceY: output coordinates in the original image. 
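 *
 * For intuition only: the sketch below is not part of this patch, and the
 * name mapToSource, like its simplified parameter list, is invented for the
 * example. It shows one plausible way to rotate a target pixel by -theta,
 * undo a scale factor, and translate to a sampled center, which is the kind
 * of inverse mapping a device helper like getTranformCoord performs.
 *
 *   // minimal serial sketch; requires <cmath> for cosf/sinf
 *   void mapToSource(int x, int y, float theta, float scale,
 *                    float tgtCenter, float centerR, float centerC,
 *                    int* srcX, int* srcY) {
 *     float dx = x - tgtCenter;                 // target-centered coordinates
 *     float dy = y - tgtCenter;
 *     float rx = cosf(-theta) * dx - sinf(-theta) * dy;  // rotate by -theta
 *     float ry = sinf(-theta) * dx + cosf(-theta) * dy;
 *     *srcX = (int)(rx / scale + centerC);      // undo scaling, re-center
 *     *srcY = (int)(ry / scale + centerR);
 *   }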
*/ -__device__ void getTranformCoord(int x, int y, real theta, real scale, - real tgtCenter, real imgCenter, - real centerR, real centerC, - int* sourceX, int* sourceY) { +__device__ void getTranformCoord(int x, + int y, + real theta, + real scale, + real tgtCenter, + real imgCenter, + real centerR, + real centerC, + int* sourceX, + int* sourceY) { real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)}; // compute coornidates in the rotated and scaled image @@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale, * created by Wei Xu (genome), converted by Jiang Wang */ -__global__ void kSamplingPatches(const real* imgs, real* targets, - int imgSize, int tgtSize, const int channels, - int samplingRate, const real* thetas, - const real* scales, const int* centerRs, - const int* centerCs, const real padValue, +__global__ void kSamplingPatches(const real* imgs, + real* targets, + int imgSize, + int tgtSize, + const int channels, + int samplingRate, + const real* thetas, + const real* scales, + const int* centerRs, + const int* centerCs, + const real padValue, const int numImages) { const int caseIdx = blockIdx.x * 4 + threadIdx.x; const int pxIdx = blockIdx.y * 128 + threadIdx.y; @@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, const int pxY = pxIdx / tgtSize; int srcPxX, srcPxY; - getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter, - imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX, + getTranformCoord(pxX, + pxY, + thetas[imgIdx], + scales[imgIdx], + tgtCenter, + imgCenter, + centerCs[caseIdx], + centerRs[caseIdx], + &srcPxX, &srcPxY); imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels; @@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets, * * created by Wei Xu */ -void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, - int*& gpuCenterR, int*& gpuCenterC, - int numImages, int imgSize, real rotateAngle, - real scaleRatio, int samplingRate, +void hl_generate_disturb_params(real*& gpuAngle, + real*& gpuScaleRatio, + int*& gpuCenterR, + int*& gpuCenterC, + int numImages, + int imgSize, + real rotateAngle, + real scaleRatio, + int samplingRate, bool isTrain) { // The number of output samples. 
int numPatches = numImages * samplingRate; @@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, for (int i = 0; i < numImages; i++) { r_angle[i] = (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT - - 0.5); + - + 0.5); s_ratio[i] = 1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT } @@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, int pxY = (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT - const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]), - sin(-r_angle[i]), cos(-r_angle[i])}; + const real H[4] = {cos(-r_angle[i]), + -sin(-r_angle[i]), + sin(-r_angle[i]), + cos(-r_angle[i])}; real x = pxX - imgCenter; real y = pxY - imgCenter; real xx = H[0] * x + H[1] * y; @@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio, delete[] center_c; } -void hl_conv_random_disturb_with_params(const real* images, int imgSize, - int tgtSize, int channels, - int numImages, int samplingRate, +void hl_conv_random_disturb_with_params(const real* images, + int imgSize, + int tgtSize, + int channels, + int numImages, + int samplingRate, const real* gpuRotationAngle, const real* gpuScaleRatio, const int* gpuCenterR, @@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize, dim3 threadsPerBlock(4, 128); dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128)); - kSamplingPatches <<>> - (images, target, imgSize, tgtSize, channels, samplingRate, - gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, - paddingValue, numImages); + kSamplingPatches<<>>(images, + target, + imgSize, + tgtSize, + channels, + samplingRate, + gpuRotationAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + paddingValue, + numImages); hl_device_synchronize(); } -void hl_conv_random_disturb(const real* images, int imgSize, - int tgtSize, int channels, int numImages, - real scaleRatio, real rotateAngle, - int samplingRate, real* gpu_r_angle, - real* gpu_s_ratio, int* gpu_center_r, - int* gpu_center_c, int paddingValue, - bool isTrain, real* targets) { +void hl_conv_random_disturb(const real* images, + int imgSize, + int tgtSize, + int channels, + int numImages, + real scaleRatio, + real rotateAngle, + int samplingRate, + real* gpu_r_angle, + real* gpu_s_ratio, + int* gpu_center_r, + int* gpu_center_c, + int paddingValue, + bool isTrain, + real* targets) { // generate the random disturbance sequence and the sampling locations - hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r, - gpu_center_c, numImages, imgSize, rotateAngle, - scaleRatio, samplingRate, isTrain); - - hl_conv_random_disturb_with_params( - images, imgSize, tgtSize, channels, numImages, - samplingRate, gpu_r_angle, gpu_s_ratio, - gpu_center_r, gpu_center_r, paddingValue, - targets); + hl_generate_disturb_params(gpu_r_angle, + gpu_s_ratio, + gpu_center_r, + gpu_center_c, + numImages, + imgSize, + rotateAngle, + scaleRatio, + samplingRate, + isTrain); + + hl_conv_random_disturb_with_params(images, + imgSize, + tgtSize, + channels, + numImages, + samplingRate, + gpu_r_angle, + gpu_s_ratio, + gpu_center_r, + gpu_center_r, + paddingValue, + targets); } diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu index 61edbe3ccc..d3b71c75e6 100644 --- a/paddle/cuda/src/hl_table_apply.cu +++ b/paddle/cuda/src/hl_table_apply.cu @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_base.h" -#include "hl_device_functions.cuh" #include "hl_cuda.h" +#include "hl_device_functions.cuh" #include "paddle/utils/Logging.h" -template -__global__ void KeMatrixAddRows(real* output, int ldo, - real* table, int ldt, +template +__global__ void KeMatrixAddRows(real* output, + int ldo, + real* table, + int ldt, int* ids, int numSamples, int tableSize, @@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo, while (idy < numSamples) { int tableId = ids[idy]; if ((0 <= tableId) && (tableId < tableSize)) { - real *out = output + idy * ldo; - real *tab = table + tableId * ldt; + real* out = output + idy * ldo; + real* tab = table + tableId * ldt; for (int i = idx; i < dim; i += blockDimX) { if (AddRow) { paddle::paddleAtomicAdd(&tab[i], out[i]); @@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo, } } -void hl_matrix_select_rows(real* output, int ldo, - real* table, int ldt, +void hl_matrix_select_rows(real* output, + int ldo, + real* table, + int ldt, int* ids, int numSamples, int tableSize, @@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo, dim3 threads(128, 8); dim3 grid(8, 1); - KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>> - (output, ldo, table, ldt, ids, numSamples, tableSize, dim); + KeMatrixAddRows<128, 8, 8, 0><<>>( + output, ldo, table, ldt, ids, numSamples, tableSize, dim); CHECK_SYNC("hl_matrix_select_rows failed"); } -void hl_matrix_add_to_rows(real* table, int ldt, - real* input, int ldi, +void hl_matrix_add_to_rows(real* table, + int ldt, + real* input, + int ldi, int* ids, int numSamples, int tableSize, @@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt, dim3 threads(128, 8); dim3 grid(8, 1); - KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>> - (input, ldi, table, ldt, ids, numSamples, tableSize, dim); + KeMatrixAddRows<128, 8, 8, 1><<>>( + input, ldi, table, ldt, ids, numSamples, tableSize, dim); CHECK_SYNC("hl_matrix_add_to_rows failed"); } -template -__global__ void KeVectorSelect(T* dst, int sized, - const T* src, int sizes, - const int* ids, int sizei) { +template +__global__ void KeVectorSelect( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { int idx = threadIdx.x + blockDimX * blockIdx.x; while (idx < sizei) { int index = ids[idx]; @@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized, } template -void hl_vector_select_from(T* dst, int sized, - const T* src, int sizes, - const int* ids, int sizei) { +void hl_vector_select_from( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { CHECK_NOTNULL(dst); CHECK_NOTNULL(src); CHECK_NOTNULL(ids); @@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized, dim3 threads(512, 1); dim3 grid(8, 1); - KeVectorSelect<<< grid, threads, 0, STREAM_DEFAULT >>> - (dst, sized, src, sizes, ids, sizei); + KeVectorSelect<<>>( + dst, sized, src, sizes, ids, sizei); CHECK_SYNC("hl_vector_select_from failed"); } -template -void hl_vector_select_from(real* dst, int sized, - const real* src, int sizes, - const int* ids, int sizei); -template -void hl_vector_select_from(int* dst, int sized, - const int* src, int sizes, - const int* ids, int sizei); - +template void hl_vector_select_from(real* dst, + int sized, + const real* src, + int sizes, + const int* ids, + int sizei); +template void hl_vector_select_from( + int* dst, int sized, const int* 
src, int sizes, const int* ids, int sizei); diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu index 4f0bbfcf4e..1896a56634 100644 --- a/paddle/cuda/src/hl_top_k.cu +++ b/paddle/cuda/src/hl_top_k.cu @@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_base.h" -#include "hl_top_k.h" #include "hl_sparse.ph" +#include "hl_top_k.h" #include "paddle/utils/Logging.h" // using namespace hppl; struct Pair { - __device__ __forceinline__ - Pair() {} + __device__ __forceinline__ Pair() {} - __device__ __forceinline__ - Pair(real value, int id) : v_(value), id_(id) {} + __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {} - __device__ __forceinline__ - void set(real value, int id) { + __device__ __forceinline__ void set(real value, int id) { v_ = value; id_ = id; } - __device__ __forceinline__ - void operator=(const Pair& in) { + __device__ __forceinline__ void operator=(const Pair& in) { v_ = in.v_; id_ = in.id_; } - __device__ __forceinline__ - bool operator<(const real value) const { + __device__ __forceinline__ bool operator<(const real value) const { return (v_ < value); } - __device__ __forceinline__ - bool operator<(const Pair& in) const { + __device__ __forceinline__ bool operator<(const Pair& in) const { return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_)); } - __device__ __forceinline__ - bool operator>(const Pair& in) const { + __device__ __forceinline__ bool operator>(const Pair& in) const { return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_)); } @@ -58,8 +50,9 @@ struct Pair { int id_; }; -__device__ __forceinline__ -void addTo(Pair topK[], const Pair &p, int beamSize) { +__device__ __forceinline__ void addTo(Pair topK[], + const Pair& p, + int beamSize) { for (int k = beamSize - 2; k >= 0; k--) { if (topK[k] < p) { topK[k + 1] = topK[k]; @@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) { topK[0] = p; } -template -__device__ __forceinline__ -void addTo(Pair topK[], const Pair &p) { +template +__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) { for (int k = beamSize - 2; k >= 0; k--) { if (topK[k] < p) { topK[k + 1] = topK[k]; @@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) { topK[0] = p; } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < src[idx]) { Pair tmp(src[idx], idx); @@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) { } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *src, int idx, int dim, - const Pair& max, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < src[idx]) { Pair tmp(src[idx], idx); @@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *val, int *col, - int idx, int dim, int beamSize) { +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* val, int* col, int idx, int dim, int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < val[idx]) { Pair tmp(val[idx], col[idx]); @@ -125,10 +115,14 @@ 
void getTopK(Pair topK[], real *val, int *col, } } -template -__device__ __forceinline__ -void getTopK(Pair topK[], real *val, int *col, int idx, int dim, - const Pair& max, int beamSize) { +template +__device__ __forceinline__ void getTopK(Pair topK[], + real* val, + int* col, + int idx, + int dim, + const Pair& max, + int beamSize) { while (idx < dim) { if (topK[beamSize - 1] < val[idx]) { Pair tmp(val[idx], col[idx]); @@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim, } } -template -__device__ __forceinline__ -void threadGetTopK(Pair topK[], int& beam, int beamSize, - real* src, - bool& firstStep, bool& isEmpty, Pair& max, - int dim, const int tid) { +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* src, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { if (beam > 0) { int length = beam < beamSize ? beam : beamSize; if (firstStep) { @@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } if (!isEmpty) { - getTopK(topK + maxLength - beam, src, tid, dim, - max, length); + getTopK(topK + maxLength - beam, src, tid, dim, max, length); } } @@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } -template -__device__ __forceinline__ -void threadGetTopK(Pair topK[], int& beam, int beamSize, - real* val, int* col, - bool& firstStep, bool& isEmpty, Pair& max, - int dim, const int tid) { +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* val, + int* col, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { if (beam > 0) { int length = beam < beamSize ? beam : beamSize; if (firstStep) { @@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } if (!isEmpty) { - getTopK(topK + maxLength - beam, val, col, tid, dim, - max, length); + getTopK( + topK + maxLength - beam, val, col, tid, dim, max, length); } } @@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize, } } -template -__device__ __forceinline__ -void blockReduce(Pair* shTopK, int* maxId, Pair topK[], - real** topVal, int** topIds, - int& beam, int& beamSize, - const int tid, const int warp) { +template +__device__ __forceinline__ void blockReduce(Pair* shTopK, + int* maxId, + Pair topK[], + real** topVal, + int** topIds, + int& beam, + int& beamSize, + const int tid, + const int warp) { while (true) { __syncthreads(); if (tid < blockSize / 2) { @@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[], } } __syncthreads(); - for (int stride = blockSize / 4; stride > 0; stride = stride/2) { + for (int stride = blockSize / 4; stride > 0; stride = stride / 2) { if (tid < stride) { if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) { maxId[tid] = maxId[tid + stride]; @@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[], * 3. go to the second setp, until one thread's topK value is null; * 4. go to the first setp, until get the topK value. 
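 *
 * For intuition only: the reference routine below is not part of this patch
 * and the name referenceTopK is invented for the example. It keeps the k
 * largest values of one row in descending order, which is the work that the
 * numbered steps above spread across the threads of a block (threadGetTopK)
 * before the per-thread results are merged by blockReduce.
 *
 *   void referenceTopK(const real* src, int dim, int k, Pair* topK) {
 *     for (int i = 0; i < k; ++i) topK[i].set(-HL_FLOAT_MAX, -1);
 *     for (int i = 0; i < dim; ++i) {
 *       if (topK[k - 1] < src[i]) {          // candidate beats current minimum
 *         Pair p(src[i], i);
 *         int j = k - 2;
 *         while (j >= 0 && topK[j] < p) {    // shift smaller entries down
 *           topK[j + 1] = topK[j];
 *           --j;
 *         }
 *         topK[j + 1] = p;
 *       }
 *     }
 *   }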
*/ -template -__global__ void KeMatrixTopK(real* topVal, int ldv, - int * topIds, - real* src, int lds, +template +__global__ void KeMatrixTopK(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize) { __shared__ Pair shTopK[blockSize]; @@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv, topK[k].set(-HL_FLOAT_MAX, -1); } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } } -template -__global__ void KeSMatrixTopK(real* topVal, int ldv, - int * topIds, +template +__global__ void KeSMatrixTopK(real* topVal, + int ldv, + int* topIds, real* val, int* row, int* col, @@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv, topK[k].set(-HL_FLOAT_MAX, -1); } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } } -void hl_matrix_top_k(real* topVal, int ldv, - int * topIds, - real* src, int lds, +void hl_matrix_top_k(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize, int numSamples) { @@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv, dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, src, lds, dim, beamSize); + KeMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, beamSize); CHECK_SYNC("hl_matrix_top_k failed"); } -void hl_sparse_matrix_top_k(real* topVal, int ldv, - int * topIds, +void hl_sparse_matrix_top_k(real* topVal, + int ldv, + int* topIds, hl_sparse_matrix_s src, int beamSize, int numSamples) { CHECK_NOTNULL(topVal); CHECK_NOTNULL(topIds); CHECK_NOTNULL(src); - CHECK_EQ(src->format, HL_SPARSE_CSR) - <<"sparse matrix format error!"; + CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!"; hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - if (csr->csr_val == NULL || csr->csr_row == NULL || - csr->csr_col == NULL) { + if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) { LOG(FATAL) << "parameter src is null!"; } dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeSMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); + KeSMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); CHECK_SYNC("hl_sparse_matrix_top_k failed"); } @@ -392,10 +402,12 @@ void 
hl_sparse_matrix_top_k(real* topVal, int ldv, * 3. go to the second setp, until one thread's topK value is null; * 4. go to the first setp, until get the topK value. */ -template -__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, - int * topIds, - real* src, int lds, +template +__global__ void KeMatrixTopKClassificationError(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize, int* label, @@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, topVal += blockIdx.x * ldv; topIds += blockIdx.x * beamSize; - Pair topK[maxLength]; // NOLINT + Pair topK[maxLength]; // NOLINT int beam = maxLength; Pair max; bool isEmpty = false; @@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv, } while (beamSize) { - threadGetTopK - (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); shTopK[tid] = topK[0]; - blockReduce - (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); } __syncthreads(); if (tid == 0) { for (int i = 0; i < topkSize; i++) { - if (*--topIds == label[blockIdx.x]) { - recResult[blockIdx.x] = 0; - break; - } - recResult[blockIdx.x] = 1.0f; + if (*--topIds == label[blockIdx.x]) { + recResult[blockIdx.x] = 0; + break; + } + recResult[blockIdx.x] = 1.0f; } } } -void hl_matrix_classification_error(real* topVal, int ldv, - int* topIds, - real* src, int lds, - int dim, - int topkSize, - int numSamples, - int* label, - real* recResult) { +void hl_matrix_classification_error(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int topkSize, + int numSamples, + int* label, + real* recResult) { CHECK_NOTNULL(topVal); CHECK_NOTNULL(topIds); CHECK_NOTNULL(src); @@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv, dim3 threads(256, 1); dim3 grid(numSamples, 1); - KeMatrixTopKClassificationError<5, 256> - <<< grid, threads, 0, STREAM_DEFAULT >>> - (topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); + KeMatrixTopKClassificationError<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); CHECK_SYNC("hl_matrix_top_k classification error failed"); } diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index de31952e79..68304c9fc8 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -1,24 +1,59 @@ # ddim lib -cc_library(ddim SRCS ddim.cc) +cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_test(tensor_test SRCS tensor_test.cc DEPS ddim) + +cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) +cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) +cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) + +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor) +cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor) + cc_test(variable_test SRCS variable_test.cc) -cc_test(scope_test SRCS scope_test.cc) -cc_test(enforce_test SRCS enforce_test.cc) -proto_library(attr_type SRCS attr_type.proto) -proto_library(op_proto SRCS op_proto.proto DEPS attr_type) -cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) -proto_library(op_desc SRCS op_desc.proto DEPS attr_type) -cc_test(op_desc_test SRCS op_desc_test.cc DEPS 
op_desc protobuf) -cc_library(operator SRCS operator.cc DEPS op_desc protobuf device_context) -cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry place) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc) -cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) -py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) + +cc_library(scope SRCS scope.cc) +cc_test(scope_test SRCS scope_test.cc DEPS scope) + +proto_library(framework_proto SRCS framework.proto) + +cc_library(attribute SRCS attribute.cc DEPS framework_proto) + +cc_library(operator SRCS operator.cc DEPS framework_proto device_context tensor scope attribute) +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) + +cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator) +cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder) +cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) +cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op) + +py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) +add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto + COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto/ + COMMENT "Copy generated python proto into directory paddle/v2/framework/proto." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + +cc_library(backward SRCS backward.cc DEPS net_op) +cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) -proto_library(net_proto SRCS net_proto.proto DEPS op_proto) -cc_library(net SRCS net.cc DEPS net_proto) +if(WITH_PYTHON) +cc_library(paddle_pybind SHARED + SRCS pybind.cc + DEPS pybind python backward + sgd_op + add_op + mul_op + rowwise_add_op + sigmoid_op + softmax_op + mean_op + cross_entropy_op + recurrent_op + uniform_random_op + gaussian_random_op + fill_zeros_like_op) +endif(WITH_PYTHON) diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc new file mode 100644 index 0000000000..9eb07acdff --- /dev/null +++ b/paddle/framework/attribute.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/attribute.h" + +#include + +namespace paddle { +namespace framework { + +template <> +AttrType AttrTypeID() { + return INT; +} +template <> +AttrType AttrTypeID() { + return FLOAT; +} +template <> +AttrType AttrTypeID() { + return STRING; +} +template <> +AttrType AttrTypeID>() { + return INTS; +} +template <> +AttrType AttrTypeID>() { + return FLOATS; +} +template <> +AttrType AttrTypeID>() { + return STRINGS; +} + +Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { + switch (attr_desc.type()) { + case paddle::framework::AttrType::INT: { + return attr_desc.i(); + } + case paddle::framework::AttrType::FLOAT: { + return attr_desc.f(); + } + case paddle::framework::AttrType::STRING: { + return attr_desc.s(); + } + case paddle::framework::AttrType::INTS: { + std::vector val(attr_desc.ints_size()); + for (int i = 0; i < attr_desc.ints_size(); ++i) { + val[i] = attr_desc.ints(i); + } + return val; + } + case paddle::framework::AttrType::FLOATS: { + std::vector val(attr_desc.floats_size()); + for (int i = 0; i < attr_desc.floats_size(); ++i) { + val[i] = attr_desc.floats(i); + } + return val; + } + case paddle::framework::AttrType::STRINGS: { + std::vector val(attr_desc.strings_size()); + for (int i = 0; i < attr_desc.strings_size(); ++i) { + val[i] = attr_desc.strings(i); + } + return val; + } + } + PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !"); + return boost::blank(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/attr_checker.h b/paddle/framework/attribute.h similarity index 61% rename from paddle/framework/attr_checker.h rename to paddle/framework/attribute.h index c0c33d8114..08b47cabd4 100644 --- a/paddle/framework/attr_checker.h +++ b/paddle/framework/attribute.h @@ -1,11 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
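The GetAttrValue switch in attribute.cc above maps each OpDesc::Attr field onto one alternative of the variant-backed Attribute type, and AttrTypeID provides the reverse C++-type-to-enum mapping. Below is a minimal standalone sketch of that pattern, using std::variant and a reduced enum so it compiles without Paddle or Boost; all names in it are illustrative, not Paddle's.

```cpp
// Standalone sketch of the Attribute variant + AttrTypeID pattern shown in
// attribute.cc/attribute.h. std::variant stands in for boost::variant and the
// enum mirrors only three of the AttrType values; names are illustrative.
#include <cassert>
#include <string>
#include <variant>

enum AttrType { INT = 0, FLOAT = 1, STRING = 2 };

using Attribute = std::variant<int, float, std::string>;

// One specialization per supported C++ type, as in attribute.cc.
template <typename T> AttrType AttrTypeID();
template <> AttrType AttrTypeID<int>() { return INT; }
template <> AttrType AttrTypeID<float>() { return FLOAT; }
template <> AttrType AttrTypeID<std::string>() { return STRING; }

int main() {
  Attribute a = 3;                      // like an OpDesc::Attr with type() == INT
  assert(std::get<int>(a) == 3);        // GetAttrValue-style extraction
  assert(AttrTypeID<float>() == FLOAT); // C++ type -> enum mapping
  return 0;
}
```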
*/ + #pragma once -#include #include #include #include +#include #include -#include "paddle/framework/enforce.h" + +#include "paddle/framework/framework.pb.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/variant.h" namespace paddle { namespace framework { @@ -13,13 +30,19 @@ namespace framework { typedef boost::variant, std::vector, std::vector> Attribute; + typedef std::unordered_map AttributeMap; +template +AttrType AttrTypeID(); + +Attribute GetAttrValue(const OpDesc::Attr& attr_desc); + // check whether a value(attribute) fit a certain limit template class LargerThanChecker { public: - LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {} void operator()(T& value) const { PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail"); } @@ -34,13 +57,43 @@ class LargerThanChecker { template class DefaultValueSetter { public: - DefaultValueSetter(T default_value) : default_value_(default_value) {} + explicit DefaultValueSetter(T default_value) + : default_value_(default_value) {} void operator()(T& value) const { value = default_value_; } private: T default_value_; }; +template +class EnumInContainer { + public: + explicit EnumInContainer(const std::unordered_set& c) : container_(c) {} + void operator()(T& val) const { + PADDLE_ENFORCE(container_.find(val) != container_.end(), + "Value %s is not in enum container %s", val, + ContainerDebugString()); + } + + private: + std::string ContainerDebugString() const { + std::ostringstream sout; + sout << "["; + size_t cnt = 0; + for (auto& v : container_) { + sout << v; + ++cnt; + if (cnt != container_.size()) { + sout << " ,"; + } + } + sout << "]"; + return sout.str(); + } + + std::unordered_set container_; +}; + // check whether a certain attribute fit its limits // an attribute can have more than one limits template @@ -48,7 +101,13 @@ class TypedAttrChecker { typedef std::function ValueChecker; public: - TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {} + explicit TypedAttrChecker(const std::string& attr_name) + : attr_name_(attr_name) {} + + TypedAttrChecker& InEnum(const std::unordered_set& range) { + value_checkers_.push_back(EnumInContainer(range)); + return *this; + } TypedAttrChecker& LargerThan(const T& lower_bound) { value_checkers_.push_back(LargerThanChecker(lower_bound)); diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc new file mode 100644 index 0000000000..bfda18724c --- /dev/null +++ b/paddle/framework/backward.cc @@ -0,0 +1,224 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
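TypedAttrChecker in attribute.h above collects one checker per limit (InEnum, LargerThan, a default-value setter) and returns *this so several limits can be chained on a single attribute. A self-contained sketch of that chaining idea follows; the class and method names are invented and it only shows the chaining mechanics, not Paddle's full checker.

```cpp
// Self-contained sketch of TypedAttrChecker-style chaining: each call appends
// a std::function<void(T&)> and returns *this. Class/method names are invented.
#include <functional>
#include <stdexcept>
#include <unordered_set>
#include <vector>

template <typename T>
class SimpleAttrChecker {
 public:
  SimpleAttrChecker& LargerThan(T lower) {
    checks_.push_back([lower](T& v) {
      if (!(v > lower)) throw std::runtime_error("larger_than check fail");
    });
    return *this;
  }
  SimpleAttrChecker& InEnum(std::unordered_set<T> allowed) {
    checks_.push_back([allowed = std::move(allowed)](T& v) {
      if (allowed.find(v) == allowed.end())
        throw std::runtime_error("value is not in enum container");
    });
    return *this;
  }
  // Run every registered limit against the attribute value.
  void operator()(T& value) const {
    for (auto& check : checks_) check(value);
  }

 private:
  std::vector<std::function<void(T&)>> checks_;
};

int main() {
  SimpleAttrChecker<int> checker;
  checker.LargerThan(0).InEnum({1, 2, 4, 8});  // chained limits on one attribute
  int beam_size = 4;
  checker(beam_size);  // passes; 3 or -1 would throw
  return 0;
}
```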
*/ + +#include "paddle/framework/backward.h" + +#include +#include + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" +#include "paddle/operators/recurrent_op.h" + +namespace paddle { +namespace framework { + +template +static void ForEachVarName(const Map& names, T callback) { + for (auto& name : names) { + for (auto& n : name.second) { + if (callback(n)) return; + } + } +} + +// return whether all the names + suffixes in the set +static bool AllInSet( + const std::map>& names, + const std::string& suffix, const std::unordered_set& set) { + bool all_in_set = true; + ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) { + all_in_set = set.find(n + suffix) != set.end(); + return !all_in_set; + }); + return all_in_set; +} + +static std::unique_ptr NOP() { + auto net_op = new operators::NetOp(); + net_op->SetType("@NOP@"); + net_op->CompleteAddOp(); + return std::unique_ptr(net_op); +} + +// Get backward operator from a forward operator, a recursive implementation. +// +// no_grad_names the gradient variable names without gradient calculating. +// +// uniq_id is a unique index used inside recursively calling +// BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and +// pass `uniq_id` through recursive calling. +// +// returns The backward operator. In a simple situation, it may be a simple +// operator, in a complex situation, it maybe a NetOp. +// +// See Backward.h for details +static std::unique_ptr BackwardRecursive( + const OperatorBase& forwardOp, + std::unordered_set& no_grad_names, size_t& uniq_id) { + // If all input gradients of forwarding operator do not need to calculate, + // just return an NOP. Not return null ptr because NOP does not take + // too much time for calculation, but it is useful for simplifying logic. + if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/, + no_grad_names /*set*/)) { + return NOP(); + } + + // All output gradients of forwarding operator do not need to calculate. + // Then all input gradients cannot be computed at all, and we put them into + // `no_grad_names` set. Return an NOP. + if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/, + no_grad_names /*set*/)) { + ForEachVarName(forwardOp.Inputs(), + [&no_grad_names](const std::string& name) -> bool { + no_grad_names.insert(GradVarName(name)); + return false; + }); + return NOP(); + } + + // Returned gradient network + auto net = std::unique_ptr(new operators::NetOp()); + + if (forwardOp.IsNetOp()) { + // Because forwardOp is a net op, it can static_cast. + auto& forwardNet = static_cast(forwardOp); + + // Map from output gradient variable name to operator's indices in + // backward net's ops_. That operator generates that variable. + std::unordered_map> dup_output_ops; + + size_t local_op_id = 0; + // reversely travel forwardNet and collect all duplicate outputs. + for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend(); + ++it, ++local_op_id) { + auto& fwd = *it; + auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id); + ForEachVarName(bwd->Outputs(), + [&dup_output_ops, local_op_id](const std::string& out) { + dup_output_ops[out].emplace_back(local_op_id); + return false; + }); + net->AppendOp(std::move(bwd)); + } + // Get unique ID for this method. + auto uid = uniq_id++; + // TODO(dzh): more comment + // multiple operators which have the same output (y for example) may + // overwrite the same y variable when backward, special operations are token + // to handle this case. 
For each duplicate output, rename it to an alias + // (original name with a offset), append an `add` op for its operator, + // and finally sum all the alias variable to the final output variable y. + using Pos = std::pair>; + std::list insert_position; + for (auto& dup_output_op : dup_output_ops) { + const std::string& name = dup_output_op.first; + auto& dup_op = dup_output_op.second; + // no duplicate output + if (dup_op.size() == 1) continue; + + // process the duplicate outputs + std::vector dup_outputs; + for (size_t i = 0; i < dup_op.size(); ++i) { + // rename each duplicate output to an alias + auto op_offset = dup_op[i]; + dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" + + std::to_string(i)); + net->ops_[op_offset]->Rename(name, dup_outputs.back()); + } + // collect all the offset to append `add` op for each alias + insert_position.push_back( + {dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}}, + {{"Out", {name}}}, {})}); + } + + // make sure the inserted `add` ops follow the BFS order. + insert_position.sort( + [](const Pos& l, const Pos& r) { return l.first > r.first; }); + + for (auto& pos : insert_position) { + net->InsertOp(pos.first + 1, std::move(pos.second)); + } + } else { + std::unique_ptr grad_op(OpRegistry::CreateGradOp(forwardOp)); + + ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op]( + const std::string& grad_input) { + if (no_grad_names.count(grad_input)) { + // +1 for \0 + std::string prefix = grad_input.substr( + 0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); + grad_op->Rename(grad_input, prefix + kZeroVarSuffix); + + // If part of input gradient of that operator is not calculated, fill + // zero variables to that input gradient. + net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", + {{"Src", {prefix}}}, + {{"Dst", {grad_input}}}, {})); + } + return false; + }); + + ForEachVarName(grad_op->Outputs(), + [&no_grad_names, &grad_op](const std::string& grad_output) { + if (no_grad_names.count(grad_output)) { + grad_op->Rename(grad_output, kEmptyVarName); + } + return false; + }); + + // process recurrent gradient op as a special operator. + if (forwardOp.Type() == "recurrent_op") { + // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), or + // this will result in infinite loop. 
+ const auto& rnnop = + *static_cast(&forwardOp); + auto rnn_grad_op = + static_cast(grad_op.get()); + const auto& stepnet_op = + *static_cast(&rnnop.stepnet()); + // create stepnet's gradient op + rnn_grad_op->set_stepnet( + BackwardRecursive(stepnet_op, no_grad_names, uniq_id)); + } + + if (net->ops_.empty()) { // Current no aux op is added to network + return grad_op; + } + net->AppendOp(std::move(grad_op)); + } + net->SetType("@GENERATED_BACKWARD@"); + net->CompleteAddOp(); + return std::unique_ptr( + static_cast(net.release())); +} + +// See header for comments +std::unique_ptr Backward( + const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars) { + std::unordered_set no_grad_names; + no_grad_names.reserve(no_grad_vars.size()); + + no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); + + for (auto& name : no_grad_vars) { + no_grad_names.insert(name + kGradVarSuffix); + } + size_t uid = 0; + return BackwardRecursive(forwardOp, no_grad_names, uid); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h new file mode 100644 index 0000000000..1ecf69881b --- /dev/null +++ b/paddle/framework/backward.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include "operator.h" +namespace paddle { +namespace framework { + +// Create the backward operator from a forward operator. +// TODO(yuyang18): Add more API reference comment. +extern std::unique_ptr Backward( + const OperatorBase& forwardOp, + const std::unordered_set& no_grad_vars); +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md new file mode 100644 index 0000000000..74c001b06a --- /dev/null +++ b/paddle/framework/backward.md @@ -0,0 +1,38 @@ +## Operator/expression 's Backward + +### Motivation + +In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation lineage, the operator/ expression's Backward feature will generate the backward pass respect to forward pass. + +### Implement : gradient operator registry + +| | forward operator | backward operator | +| ---------------------- | ---------------- | -------------------------------- | +| **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients | +| **Operator::outputs_** | Outputs | InputGradients | + +Inputs/Outputs means the input/output of the operator, InputGradients/OutputGradients is the gradient respect to forward opeartor. Forward operator and Backward operator are isomorphic, save their corresponding needs into member attribute. + +We use a global hash map record the gradient operators available, follow the philosophy of minimum core, make operator pluggable unit. Each gradient is an operator and it needs to regist itself. 
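The paragraph above describes the gradient-operator registry as a global hash map keyed by operator type, with each gradient operator registering itself. A toy version of that idea, just to make the lookup concrete; the registry shape and names here are invented and much simpler than Paddle's OpRegistry records.

```cpp
// Toy version of the "global hash map of gradient operators" described above:
// a forward op type maps to its gradient op type, and a creator functor is
// looked up by name. Shapes and names are invented; Paddle's OpRegistry keeps
// richer per-op records.
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct OpBase {
  virtual ~OpBase() = default;
};
struct MulGradOp : OpBase {};

using Creator = std::function<std::unique_ptr<OpBase>()>;

std::unordered_map<std::string, std::string>& GradOpTypeMap() {
  static std::unordered_map<std::string, std::string> m;  // forward type -> grad type
  return m;
}
std::unordered_map<std::string, Creator>& CreatorMap() {
  static std::unordered_map<std::string, Creator> m;      // op type -> creator
  return m;
}

int main() {
  // "Registration", roughly what a REGISTER_OP(mul, ..., mul_grad, ...) macro
  // does in spirit: record the grad op type and how to construct it.
  GradOpTypeMap()["mul"] = "mul_grad";
  CreatorMap()["mul_grad"] = [] { return std::unique_ptr<OpBase>(new MulGradOp()); };

  // Building the gradient op for a forward "mul": two lookups.
  auto grad = CreatorMap()[GradOpTypeMap()["mul"]]();
  std::cout << "grad op created: " << (grad != nullptr) << "\n";
  return 0;
}
```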
+ +grad_op_builder(fengjiayi) + +### Implement : Backward network + +given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`. + +1. bla bla bla (yuyang) + +2. NetOp + + when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively and ensure them done. During the process, we need to collect the `OutputGradients` name. + + We share variable in the same scope, as a result, duplicate operator `OutputGradients` will overwirte then duplicate variable. + + ![./images/duplicate_op]() + + Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator instead. + +![./images/duplicate_op2]() + +​ Then collect the sub graph OutputGradients/InputGradients as the NetOp's and return it. diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc new file mode 100644 index 0000000000..b93ab66f2f --- /dev/null +++ b/paddle/framework/backward_test.cc @@ -0,0 +1,401 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/backward.h" + +#include +#include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" + +namespace paddle { +namespace framework { + +using OperatorBase = framework::OperatorBase; +using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker; +using OpProto = framework::OpProto; +using OpAttrChecker = framework::OpAttrChecker; +using Scope = framework::Scope; +using DeviceContext = platform::DeviceContext; + +class RowWiseAddOpMaker : public OpProtoAndCheckerMaker { + public: + RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input X of Add").NotInGradient(); + AddInput("b", "Bias of Add").NotInGradient(); + AddOutput("Out", "Out of Add").NotInGradient(); + AddComment("Add Op"); + } +}; + +class MulOpMaker : public OpProtoAndCheckerMaker { + public: + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "A"); + AddInput("Y", "B"); + AddOutput("Out", "Out"); + AddComment("Mul"); + } +}; + +class SigmoidOpMaker : public OpProtoAndCheckerMaker { + public: + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "X"); + AddOutput("Out", "Y"); + AddComment("Sigmoid"); + } +}; + +class NoGradOpMaker : public OpProtoAndCheckerMaker { + public: + NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "X input"); + AddOutput("Out", "Y output"); + AddComment("NoGradOp, same input output. 
no Grad"); + } +}; + +class FcOp : public operators::NetOp { + public: + FcOp(const std::string &type, const VarNameMap &inputs, + const VarNameMap &outputs, const AttributeMap &attrs) + : NetOp(type, inputs, outputs, attrs) { + AppendOp(OpRegistry::CreateOp("mul", + {{"X", {Input("X")}}, {"Y", {Input("W")}}}, + {{"Out", {Output("mul_result")}}}, {})); + auto input_b = Inputs("b"); + std::string before_act = "mul_result"; + if (input_b.size() != 0) { + AppendOp(OpRegistry::CreateOp( + "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}}, + {{"Out", {Output("add_result")}}}, {})); + before_act = "add_result"; + } else { + auto out_varname = Output("add_result"); + if (out_varname != kEmptyVarName) { + this->Rename(out_varname, kEmptyVarName); + } + } + + AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}}, + {{"Out", {Output("Out")}}}, {})); + CompleteAddOp(false); + } +}; + +class FcOpMaker : public OpProtoAndCheckerMaker { + public: + FcOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x"); + AddInput("W", "w"); + AddInput("b", "b"); + AddOutput("mul_result", "").AsIntermediate(); + AddOutput("add_result", "").AsIntermediate(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class ManyOutputOpMaker : public OpProtoAndCheckerMaker { + public: + ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "x"); + AddOutput("y", "y"); + AddOutput("z", "z"); + AddComment(""); + } +}; + +class FillZeroOpMaker : public OpProtoAndCheckerMaker { + public: + FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("x", "x"); + AddOutput("out", "out"); + AddComment(""); + } +}; + +class AddOpMaker : public OpProtoAndCheckerMaker { + public: + AddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "x").AsDuplicable(); + AddOutput("Y", "y"); + AddComment(""); + } +}; +} // namespace framework +} // namespace paddle + +namespace f = paddle::framework; +namespace ops = paddle::operators; +using EnforceNotMet = paddle::platform::EnforceNotMet; +REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, rowwise_add_grad, + f::NOP); +REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP); +REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP); +REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker); +REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP); +REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker); +REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad, + f::NOP); + +TEST(Backward, simple_op_grad) { + auto fwd = f::OpRegistry::CreateOp( + "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + ASSERT_NE(fwd, nullptr); + auto gop = f::OpRegistry::CreateGradOp(*fwd); + ASSERT_EQ(1UL, gop->Inputs().size()); + ASSERT_EQ("rowwise_add_grad", gop->Type()); + ASSERT_EQ(f::GradVarName("x"), gop->Output(f::GradVarName("X"))); + ASSERT_EQ(f::GradVarName("b"), gop->Output(f::GradVarName("b"))); +} + +TEST(Backward, simple_op_not_need_grad) { + auto fwd = f::OpRegistry::CreateOp( + "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + ASSERT_NE(fwd, nullptr); + auto gop = f::Backward(*fwd, {"x"}); + ASSERT_EQ(gop->Output(f::GradVarName("X")), 
f::kEmptyVarName); + + auto no_input_gop = f::Backward(*fwd, {"x", "b"}); + ASSERT_NE(no_input_gop, nullptr); + ASSERT_TRUE(no_input_gop->IsNetOp()); + ASSERT_EQ(0UL, static_cast(no_input_gop.get())->ops_.size()); +} + +TEST(Backward, net_fc_backward_normal) { + std::shared_ptr fwd = + f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}}, + {{"mul_result", {"mul_res"}}, + {"add_result", {"add_re"}}, + {"Out", {"out"}}}, + {}); + ASSERT_NE(fwd, nullptr); + std::shared_ptr gop = f::Backward(*fwd, {}); + ASSERT_TRUE(gop->IsNetOp()); + auto net = static_cast(gop.get()); + + ASSERT_NO_THROW(net->DebugString()); + + ASSERT_EQ(3UL, net->ops_.size()); + + f::OperatorBase &d_sigmoid = *net->ops_[0]; + ASSERT_EQ("sigmoid_grad", d_sigmoid.Type()); + + f::OperatorBase &d_add = *net->ops_[1]; + ASSERT_EQ("rowwise_add_grad", d_add.Type()); + + f::OperatorBase &d_mul = *net->ops_[2]; + ASSERT_EQ("mul_grad", d_mul.Type()); +} + +TEST(Backward, net_fc_backward_not_have_b) { + std::shared_ptr fwd = + f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}}, + {{"mul_result", {"mul_res"}}, + {"add_result", {"add_res"}}, + {"Out", {"tmp"}}}, + {}); + ASSERT_NE(fwd, nullptr); + std::shared_ptr gop = f::Backward(*fwd, {}); + ASSERT_TRUE(gop->IsNetOp()); + auto net = static_cast(gop.get()); + + ASSERT_NO_THROW(net->DebugString()); + + ASSERT_EQ(2UL, net->ops_.size()); + + f::OperatorBase &d_sigmoid = *net->ops_[0]; + ASSERT_EQ("sigmoid_grad", d_sigmoid.Type()); + + f::OperatorBase &d_mul = *net->ops_[1]; + ASSERT_EQ("mul_grad", d_mul.Type()); +} + +TEST(Backward, net_input_of_network_not_need_grad) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}}, + {{"mul_result", {"mul_tmp_0"}}, + {"add_result", {"add_tmp_0"}}, + {"Out", {"hidden0"}}}, + {})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}}, + {{"mul_result", {"mul_tmp_1"}}, + {"add_result", {"add_tmp_1"}}, + {"Out", {"hidden1"}}}, + {})); + net.CompleteAddOp(); + auto bwd = Backward(net, {"x"}); // x@GRAD is not need. 
+ ASSERT_TRUE(bwd->IsNetOp()); + auto bwd_net = static_cast(bwd.get()); + + auto output_vars = bwd_net->OutputVars(true); + std::unordered_set all_outputs = + std::unordered_set(output_vars.begin(), output_vars.end()); + all_outputs.erase(f::kEmptyVarName); + + for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) { + ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end()); + } + + // Not Generated X + ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end()); + + ASSERT_EQ(2UL, bwd_net->ops_.size()); + ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp()); + auto first_fc_grad = static_cast(bwd_net->ops_[1].get()); + ASSERT_EQ(3UL, first_fc_grad->ops_.size()); + ASSERT_EQ(f::kEmptyVarName, + first_fc_grad->ops_[2]->Output(f::GradVarName("X"))); +} + +TEST(Backward, net_shared_weight) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}}, + {{"Out", {"out"}}}, {})); + net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}}, + {{"Out", {"FinalOut"}}}, {})); + net.CompleteAddOp(); + + auto bwd = f::Backward(net, {}); + ASSERT_TRUE(bwd->IsNetOp()); + auto bwd_net = static_cast(bwd.get()); + ASSERT_EQ(3UL, bwd_net->ops_.size()); + ASSERT_EQ("add", bwd_net->ops_[2]->Type()); +} + +TEST(Backward, op_register_grad_not_for_network) { + auto fwd = + f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}}, + {{"mul_result", {"mul_out"}}, + {"add_result", {"add_out"}}, + {"Out", {"out1"}}}, + {{"temporary_index", std::vector{0, 1}}}); + + ASSERT_THROW(f::OpRegistry::CreateGradOp(*fwd), EnforceNotMet); +} + +TEST(Backward, op_all_input_are_not_need) { + auto fwd = f::OpRegistry::CreateOp( + "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto backward = f::Backward(*fwd, {"x", "b"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_TRUE(net->ops_.empty()); +} + +TEST(Backward, op_all_output_are_not_need) { + auto fwd = f::OpRegistry::CreateOp( + "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto backward = f::Backward(*fwd, {"out"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_TRUE(net->ops_.empty()); +} + +TEST(Backward, op_part_of_output_are_not_need) { + auto fwd = f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, + {{"y", {"Y"}}, {"z", {"Z"}}}, {}); + auto backward = f::Backward(*fwd, {"Z"}); + ASSERT_TRUE(backward->IsNetOp()); + auto net = static_cast(backward.get()); + ASSERT_EQ(net->ops_.size(), 2UL); + + auto &fill_zero = *net->ops_[0]; + ASSERT_EQ("fill_zeros_like", fill_zero.Type()); + ASSERT_EQ(1UL, fill_zero.Inputs("Src").size()); + ASSERT_EQ("Z", fill_zero.Input("Src")); + ASSERT_EQ(1UL, fill_zero.Outputs("Dst").size()); + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Dst")); + + auto &d_many_out = *net->ops_[1]; + ASSERT_EQ("many_output_op_grad", d_many_out.Type()); + ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size()); // I/O/OG + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, + d_many_out.Input(f::GradVarName("z"))); + ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y"))); + ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x"))); +} + +TEST(Backward, op_part_of_input_are_not_need) { + auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}}, + {{"Out", {"out"}}}, {}); + auto backward = f::Backward(*fwd, {"a"}); + auto &grad_mul = *backward; + ASSERT_EQ(grad_mul.Type(), "mul_grad"); + 
ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL); + ASSERT_EQ(grad_mul.Outputs().size(), 2UL); + ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName); + ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), f::GradVarName("b")); + ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out")); + ASSERT_EQ(grad_mul.Input("X"), "a"); + ASSERT_EQ(grad_mul.Input("Y"), "b"); + ASSERT_EQ(grad_mul.Input("Out"), "out"); +} + +TEST(Backward, linear_net_intermediate_variable_has_no_grad) { + ops::NetOp net; + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}}, + {{"mul_result", {"mul_out1"}}, + {"add_result", {"add_out1"}}, + {"Out", {"out1"}}}, + {})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}}, + {{"mul_result", {"mul_out2"}}, + {"add_result", {"tmp_out2"}}, + {"Out", {"out2"}}}, + {})); + net.AppendOp(f::OpRegistry::CreateOp( + "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}}, + {{"mul_result", {"mul_out3"}}, + {"add_result", {"tmp_out3"}}, + {"Out", {"out3"}}}, + {})); + net.CompleteAddOp(); + + auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); + ASSERT_TRUE(backward->IsNetOp()); + auto bwd_net = static_cast(backward.get()); + ASSERT_EQ(bwd_net->ops_.size(), 3UL); + auto &grad_fc = *bwd_net->ops_[0]; + + const char *all = paddle::operators::NetOp::kAll; + EXPECT_EQ(grad_fc.Inputs(all).size(), + 2UL /* external input number */ + + 1UL /* external output number*/ + + 1UL /* number of gradient of external output*/ + + 2U /* internal variable number*/); + EXPECT_EQ(grad_fc.Outputs(all).size(), + 2UL /* input number of mul*/ + + 2UL /* input number of rowwise_add + */ + + 1UL /* input number of sigmod */); + EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL); + EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL); +} diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 3f949a6595..cfd3e8dfde 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -1,9 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + #include "paddle/framework/ddim.h" +#include "paddle/platform/enforce.h" namespace paddle { namespace framework { -///@cond HIDDEN +/// @cond HIDDEN template Dim make_dim(const int* d) { @@ -50,7 +65,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) { } } -///@endcond +/// @endcond DDim make_ddim(std::initializer_list dims) { DDim result(make_dim(0)); @@ -64,11 +79,11 @@ DDim make_ddim(const std::vector& dims) { return result; } -///@cond HIDDEN +/// @cond HIDDEN // XXX For some reason, putting this in an anonymous namespace causes errors class DynamicMutableIndexer : public boost::static_visitor { public: - DynamicMutableIndexer(int idx) : idx_(idx) {} + explicit DynamicMutableIndexer(int idx) : idx_(idx) {} template int& operator()(Dim& dim) const { @@ -81,7 +96,7 @@ class DynamicMutableIndexer : public boost::static_visitor { class DynamicConstIndexer : public boost::static_visitor { public: - DynamicConstIndexer(int idx) : idx_(idx) {} + explicit DynamicConstIndexer(int idx) : idx_(idx) {} template int operator()(const Dim& dim) const { @@ -92,7 +107,7 @@ class DynamicConstIndexer : public boost::static_visitor { int idx_; }; -///@endcond +/// @endcond int& DDim::operator[](int idx) { return boost::apply_visitor(DynamicMutableIndexer(idx), var); @@ -102,6 +117,8 @@ int DDim::operator[](int idx) const { return boost::apply_visitor(DynamicConstIndexer(idx), var); } +ssize_t DDim::size() const { return arity(*this); } + bool DDim::operator==(DDim d) const { if (var.which() != d.getVar().which()) { return false; @@ -155,11 +172,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; } void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } -///@cond HIDDEN +/// @cond HIDDEN struct VectorizeVisitor : public boost::static_visitor<> { std::vector& vector; - VectorizeVisitor(std::vector& v) : vector(v) {} + explicit VectorizeVisitor(std::vector& v) : vector(v) {} template void operator()(const T& t) { @@ -169,7 +186,7 @@ struct VectorizeVisitor : public boost::static_visitor<> { void operator()(const Dim<1>& t) { vector.push_back(t.head); } }; -///@endcond +/// @endcond std::vector vectorize(const DDim& ddim) { std::vector result; @@ -178,16 +195,59 @@ std::vector vectorize(const DDim& ddim) { return result; } +struct ProductVisitor : public boost::static_visitor { + template + ssize_t operator()(const Dim& dim) { + return product(dim); + } +}; + ssize_t product(const DDim& ddim) { - ssize_t result = 1; - std::vector v = vectorize(ddim); - for (auto i : v) { - result *= i; + ProductVisitor visitor; + return boost::apply_visitor(visitor, ddim); +} + +struct SliceVectorizeVisitor : public boost::static_visitor<> { + std::vector& vector; + int begin; + int end; + + SliceVectorizeVisitor(std::vector& v, int b, int e) + : vector(v), begin(b), end(e) { + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in ddim slice."); + PADDLE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); } - return result; + + template + void operator()(const Dim& dim) { + if (begin == 0) { + vector.push_back(dim.head); + } else { + --begin; + } + --end; + if (end > 0) { + this->operator()(dim.tail); + } + } + + void operator()(const Dim<1>& dim) { + PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound."); + vector.push_back(dim.head); + } +}; + +DDim slice_ddim(const DDim& dim, int begin, int end) { + std::vector vec; + vec.reserve(end - begin); + SliceVectorizeVisitor visitor(vec, begin, end); + boost::apply_visitor(visitor, dim); + return 
make_ddim(vec); } -///\cond HIDDEN +/// \cond HIDDEN struct ArityVisitor : boost::static_visitor { template @@ -196,15 +256,15 @@ struct ArityVisitor : boost::static_visitor { } }; -///\endcond +/// \endcond int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } -///\cond HIDDEN +/// \cond HIDDEN struct DDimPrinter : boost::static_visitor { std::ostream& os; - DDimPrinter(std::ostream& os_) : os(os_) {} + explicit DDimPrinter(std::ostream& os_) : os(os_) {} template void operator()(const T& t) { @@ -212,7 +272,7 @@ struct DDimPrinter : boost::static_visitor { } }; -///\endcond +/// \endcond std::ostream& operator<<(std::ostream& os, const DDim& ddim) { DDimPrinter printer(os); @@ -220,5 +280,8 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { return os; } +DDim::DDim(std::initializer_list init_list) { + *this = make_ddim(init_list); +} } // namespace framework } // namespace paddle diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 223c4180be..95f294b627 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -1,33 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #pragma once -#include #include #include #include - #include "paddle/framework/dim.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/variant.h" namespace paddle { namespace framework { -namespace { -typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, - Dim<8>, Dim<9>> - DDimVar; -} - /** * \brief A dynamically sized dimension. * * The number of dimensions must be between [1, 9]. */ struct DDim { + typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, + Dim<8>, Dim<9>> + DDimVar; DDimVar var; DDim() : var(Dim<1>()) {} template - DDim(const Dim& in) : var(in) {} + explicit DDim(const Dim& in) : var(in) {} + + /*implicit*/ DDim(std::initializer_list init_list); template DDim& operator=(const Dim& in) { @@ -57,6 +70,8 @@ struct DDim { DDim operator+(DDim d) const; DDim operator*(DDim d) const; + + ssize_t size() const; }; /** @@ -81,6 +96,15 @@ std::vector vectorize(const DDim& ddim); ssize_t product(const DDim& ddim); +/** + * \brief Slice a ddim + * + * Slice dim with [begin, end). + * e.g. DDim d = make_ddim({1,2,3,4,5}); + * slice_ddim(d, 1, 3); ====> {2,3} + */ +DDim slice_ddim(const DDim& dim, int begin, int end); + /** * \brief What is the length of this dimension? 
* diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index 36eef02370..9d18a2972c 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -49,9 +49,30 @@ TEST(DDim, Equality) { // arity of a DDim EXPECT_EQ(paddle::framework::arity(ddim), 3); + EXPECT_EQ(ddim.size(), 3); // product of a DDim EXPECT_EQ(paddle::framework::product(vddim), 45); + EXPECT_EQ( + paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})), + 90); + + // slice a DDim + paddle::framework::DDim ddim2 = + paddle::framework::make_ddim({1, 2, 3, 4, 5, 6}); + paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5); + EXPECT_EQ(arity(ss), 3); + EXPECT_EQ(ss[0], 3); + EXPECT_EQ(ss[1], 4); + EXPECT_EQ(ss[2], 5); + paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6); + EXPECT_EQ(arity(ss2), 6); + EXPECT_EQ(ss2[0], 1); + EXPECT_EQ(ss2[1], 2); + EXPECT_EQ(ss2[2], 3); + EXPECT_EQ(ss2[3], 4); + EXPECT_EQ(ss2[4], 5); + EXPECT_EQ(ss2[5], 6); } TEST(DDim, Print) { diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu index 0521741519..3898d0a447 100644 --- a/paddle/framework/dim_test.cu +++ b/paddle/framework/dim_test.cu @@ -1,100 +1,101 @@ #include #include -#include "paddle/framework/dim.h" #include "gtest/gtest.h" +#include "paddle/framework/dim.h" __global__ void test(paddle::framework::Dim<2>* o) { - o[0] = paddle::framework::make_dim(5, 6); + o[0] = paddle::framework::make_dim(5, 6); } __global__ void dyn_idx_gpu(int* o) { - auto d = paddle::framework::make_dim(5, 6); - o[0] = d[1]; + auto d = paddle::framework::make_dim(5, 6); + o[0] = d[1]; } TEST(Dim, Equality) { - // construct a Dim on the CPU - auto a = paddle::framework::make_dim(3, 4); - EXPECT_EQ(paddle::framework::get<0>(a), 3); - EXPECT_EQ(paddle::framework::get<1>(a), 4); - - // construct a Dim on the GPU - thrust::device_vector> t(2); - test<<<1,1>>>(thrust::raw_pointer_cast(t.data())); - a = t[0]; - EXPECT_EQ(paddle::framework::get<0>(a), 5); - EXPECT_EQ(paddle::framework::get<1>(a), 6); - - // linearization - auto b = paddle::framework::make_dim(7, 8); - EXPECT_EQ(paddle::framework::linearize(a, b), 83); - - // product - EXPECT_EQ(paddle::framework::product(a), 30); - - // mutate a Dim - paddle::framework::get<1>(b) = 10; - EXPECT_EQ(paddle::framework::get<0>(b), 7); - EXPECT_EQ(paddle::framework::get<1>(b), 10); - - // dynamic access - paddle::framework::get(b, 0) = 8; - b[1] = 11; - EXPECT_EQ(paddle::framework::get<0>(b), 8); - EXPECT_EQ(paddle::framework::get<1>(b), 11); - EXPECT_EQ(paddle::framework::get(b, 0), 8); - EXPECT_EQ(b[1], 11); - - // dynamic access on GPU - thrust::device_vector r(1); - dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data())); - int res = r[0]; - EXPECT_EQ(res, 6); - - // ex_prefix_mul - paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 3); - EXPECT_EQ(paddle::framework::get<2>(c), 12); - - // generate from an index - auto size = paddle::framework::make_dim(4, 5, 2); - c = paddle::framework::Dim<3>(14, size); - EXPECT_EQ(paddle::framework::get<0>(c), 2); - EXPECT_EQ(paddle::framework::get<1>(c), 3); - EXPECT_EQ(paddle::framework::get<2>(c), 0); - c = paddle::framework::Dim<3>(25, size); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 1); - EXPECT_EQ(paddle::framework::get<2>(c), 1); + // construct a Dim on the CPU + auto a = 
paddle::framework::make_dim(3, 4); + EXPECT_EQ(paddle::framework::get<0>(a), 3); + EXPECT_EQ(paddle::framework::get<1>(a), 4); + + // construct a Dim on the GPU + thrust::device_vector> t(2); + test<<<1, 1>>>(thrust::raw_pointer_cast(t.data())); + a = t[0]; + EXPECT_EQ(paddle::framework::get<0>(a), 5); + EXPECT_EQ(paddle::framework::get<1>(a), 6); + + // linearization + auto b = paddle::framework::make_dim(7, 8); + EXPECT_EQ(paddle::framework::linearize(a, b), 83); + + // product + EXPECT_EQ(paddle::framework::product(a), 30); + + // mutate a Dim + paddle::framework::get<1>(b) = 10; + EXPECT_EQ(paddle::framework::get<0>(b), 7); + EXPECT_EQ(paddle::framework::get<1>(b), 10); + + // dynamic access + paddle::framework::get(b, 0) = 8; + b[1] = 11; + EXPECT_EQ(paddle::framework::get<0>(b), 8); + EXPECT_EQ(paddle::framework::get<1>(b), 11); + EXPECT_EQ(paddle::framework::get(b, 0), 8); + EXPECT_EQ(b[1], 11); + + // dynamic access on GPU + thrust::device_vector r(1); + dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); + int res = r[0]; + EXPECT_EQ(res, 6); + + // ex_prefix_mul + paddle::framework::Dim<3> c = + paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 12); + + // generate from an index + auto size = paddle::framework::make_dim(4, 5, 2); + c = paddle::framework::Dim<3>(14, size); + EXPECT_EQ(paddle::framework::get<0>(c), 2); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 0); + c = paddle::framework::Dim<3>(25, size); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 1); + EXPECT_EQ(paddle::framework::get<2>(c), 1); } TEST(Dim, Bool) { - auto a = paddle::framework::make_dim(3, 4); - auto b = paddle::framework::make_dim(5, 6); - auto c = paddle::framework::make_dim(3, 4); - - // in_bounds check - EXPECT_TRUE(paddle::framework::contained(a, b)); - EXPECT_FALSE(paddle::framework::contained(b, a)); - - // comparison - EXPECT_TRUE(a == a); - EXPECT_FALSE(a == b); - EXPECT_TRUE(a == c); + auto a = paddle::framework::make_dim(3, 4); + auto b = paddle::framework::make_dim(5, 6); + auto c = paddle::framework::make_dim(3, 4); + + // in_bounds check + EXPECT_TRUE(paddle::framework::contained(a, b)); + EXPECT_FALSE(paddle::framework::contained(b, a)); + + // comparison + EXPECT_TRUE(a == a); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a == c); } TEST(Dim, Print) { - { - std::stringstream ss; - auto a = paddle::framework::make_dim(2, 3); - ss << a; - EXPECT_EQ(ss.str(), "2, 3"); - } - { - std::stringstream ss; - ss << paddle::framework::make_dim(8); - EXPECT_EQ(ss.str(), "8"); - } + { + std::stringstream ss; + auto a = paddle::framework::make_dim(2, 3); + ss << a; + EXPECT_EQ(ss.str(), "2, 3"); + } + { + std::stringstream ss; + ss << paddle::framework::make_dim(8); + EXPECT_EQ(ss.str(), "8"); + } } diff --git a/paddle/framework/eigen.h b/paddle/framework/eigen.h new file mode 100644 index 0000000000..a4667cc51f --- /dev/null +++ b/paddle/framework/eigen.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/tensor.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace framework { + +// EigenDim converts paddle::platform::DDim into Eigen::DSizes. +template +struct EigenDim { + using Type = Eigen::DSizes; + + static Type From(const DDim& dims) { + PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)"); + Type ret; + for (int d = 0; d < arity(dims); d++) { + ret[d] = dims[d]; + } + return ret; + } +}; + +// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. +template +struct EigenTensor { + // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on + // the speed of aligned and unaligned version in future. + using Type = Eigen::TensorMap>; + + using ConstType = + Eigen::TensorMap>; + + static Type From(Tensor& tensor, DDim dims) { + return Type(tensor.data(), EigenDim::From(dims)); + } + + static Type From(Tensor& tensor) { return From(tensor, tensor.dims_); } + + static ConstType From(const Tensor& tensor, DDim dims) { + return ConstType(tensor.data(), EigenDim::From(dims)); + } + + static ConstType From(const Tensor& tensor) { + return From(tensor, tensor.dims_); + } +}; + +template +struct EigenMatrix : public EigenTensor {}; + +template +struct EigenVector : public EigenTensor { + // Flatten reshapes a Tensor into an EigenVector. + static typename EigenVector::Type Flatten(Tensor& tensor) { + return EigenVector::From( + tensor, make_ddim({static_cast(product(tensor.dims_))})); + } + + static typename EigenVector::ConstType Flatten(const Tensor& tensor) { + return EigenVector::From( + tensor, make_ddim({static_cast(product(tensor.dims_))})); + } +}; + +template +struct EigenScalar { + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + using Type = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + using ConstType = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + + static Type From(Tensor& tensor) { return Type(tensor.data()); } + + static ConstType From(const Tensor& tensor) { + return ConstType(tensor.data()); + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/eigen_test.cc b/paddle/framework/eigen_test.cc new file mode 100644 index 0000000000..dc1957691b --- /dev/null +++ b/paddle/framework/eigen_test.cc @@ -0,0 +1,112 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
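EigenMatrix::From and EigenVector::Flatten above are thin wrappers that reinterpret a Tensor's buffer as Eigen tensor views of a chosen rank, without copying. The following standalone illustration shows the underlying Eigen::TensorMap mechanics, assuming only Eigen's unsupported CXX11 Tensor module; the raw array stands in for Tensor::data<float>().

```cpp
// Standalone illustration of the Eigen::TensorMap views that EigenMatrix::From
// and EigenVector::Flatten produce: different-rank views over the same buffer,
// no copies. Requires Eigen's unsupported CXX11 Tensor module; the raw array
// stands in for a Paddle Tensor's underlying allocation.
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  float buf[6] = {0, 1, 2, 3, 4, 5};

  Eigen::TensorMap<Eigen::Tensor<float, 2>> mat(buf, 2, 3);  // 2 x 3 matrix view
  Eigen::TensorMap<Eigen::Tensor<float, 1>> vec(buf, 6);     // flattened view

  mat(1, 2) += 10.0f;                // writes through the view into buf
  std::cout << vec(5) << std::endl;  // same storage: prints 15
  return 0;
}
```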
+*/ + +#include "paddle/framework/eigen.h" +#include + +namespace paddle { +namespace framework { + +TEST(EigenDim, From) { + EigenDim<3>::Type ed = EigenDim<3>::From(make_ddim({1, 2, 3})); + ASSERT_EQ(1, ed[0]); + ASSERT_EQ(2, ed[1]); + ASSERT_EQ(3, ed[2]); +} + +TEST(Eigen, Tensor) { + Tensor t; + float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); + for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenTensor::Type et = EigenTensor::From(t); + + ASSERT_EQ(1, et.dimension(0)); + ASSERT_EQ(2, et.dimension(1)); + ASSERT_EQ(3, et.dimension(2)); + + for (int i = 0; i < 1; i++) { + for (int j = 0; j < 2; j++) { + for (int k = 0; k < 3; k++) { + ASSERT_NEAR((i * 2 + j) * 3 + k, et(i, j, k), 1e-6f); + } + } + } +} + +TEST(Eigen, ScalarFrom) { + Tensor t; + int* p = t.mutable_data(make_ddim({1}), platform::CPUPlace()); + *p = static_cast(100); + + EigenScalar::Type es = EigenScalar::From(t); + + ASSERT_EQ(0, es.dimension(0)); + ASSERT_EQ(100, es(0)); +} + +TEST(Eigen, VectorFrom) { + Tensor t; + float* p = t.mutable_data(make_ddim({6}), platform::CPUPlace()); + for (int i = 0; i < 6; i++) { + p[i] = static_cast(i); + } + + EigenVector::Type ev = EigenVector::From(t); + + ASSERT_EQ(6, ev.dimension(0)); + + for (int i = 0; i < 6; i++) { + ASSERT_NEAR(i, ev(i), 1e-6f); + } +} + +TEST(Eigen, VectorFlatten) { + Tensor t; + float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); + for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenVector::Type ev = EigenVector::Flatten(t); + + ASSERT_EQ(1 * 2 * 3, ev.dimension(0)); + + for (int i = 0; i < 1 * 2 * 3; i++) { + ASSERT_NEAR(i, ev(i), 1e-6f); + } +} + +TEST(Eigen, Matrix) { + Tensor t; + float* p = t.mutable_data(make_ddim({2, 3}), platform::CPUPlace()); + for (int i = 0; i < 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenMatrix::Type em = EigenMatrix::From(t); + + ASSERT_EQ(2, em.dimension(0)); + ASSERT_EQ(3, em.dimension(1)); + + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 3; j++) { + ASSERT_NEAR(i * 3 + j, em(i, j), 1e-6f); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/enforce.h b/paddle/framework/enforce.h deleted file mode 100644 index 56cb7f9564..0000000000 --- a/paddle/framework/enforce.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include - -namespace paddle { -namespace framework { - -/** - * @brief Enforce exception. Inherits std::exception - * - * All enforce condition not met, will throw an EnforceNotMet exception. 
- */ -class EnforceNotMet : public std::exception { - public: - EnforceNotMet(const std::string& msg, const char* file, int fileline) { - std::ostringstream sout; - sout << msg << " at [" << file << ":" << fileline << "];"; - all_msg_ = sout.str(); - } - - const char* what() const noexcept override { return all_msg_.c_str(); } - - private: - std::string all_msg_; -}; - -// From https://stackoverflow.com/questions/30130930/ -// __buildin_expect is in C++ 11 standard. Since the condition which enforced -// should be true in most situation, it will make the compiler generate faster -// code by adding `UNLIKELY` macro. -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) - -/** - * @brief Throw a EnforceNotMet exception, automatically filled __FILE__ & - * __LINE__ - * - * This macro take __VA_ARGS__, user can pass any type if that type can - * serialize to std::ostream - */ -#define PADDLE_THROW(...) \ - do { \ - throw ::paddle::framework::EnforceNotMet( \ - ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \ - } while (0) - -/** - * @brief Enforce a condition, otherwise throw an EnforceNotMet - */ -#define PADDLE_ENFORCE(condition, ...) \ - do { \ - if (UNLIKELY(!(condition))) { \ - PADDLE_THROW(__VA_ARGS__); \ - } \ - } while (0) - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/enforce_test.cc b/paddle/framework/enforce_test.cc deleted file mode 100644 index f8da1a192f..0000000000 --- a/paddle/framework/enforce_test.cc +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -TEST(ENFORCE, OK) { - PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345); - size_t val = 1; - const size_t limit = 10; - PADDLE_ENFORCE(val < limit, "Enforce is OK too"); -} - -TEST(ENFORCE, FAILED) { - bool in_catch = false; - try { - PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123); - } catch (paddle::framework::EnforceNotMet err) { - in_catch = true; - std::string msg = "Enforce is not ok 123 at all"; - const char* what = err.what(); - for (size_t i = 0; i < msg.length(); ++i) { - ASSERT_EQ(what[i], msg[i]); - } - } - ASSERT_TRUE(in_catch); -} \ No newline at end of file diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto new file mode 100644 index 0000000000..ae44a1ffd4 --- /dev/null +++ b/paddle/framework/framework.proto @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +package paddle.framework; + +enum AttrType { + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; +} + +// OpDesc describes an instance of a C++ framework::OperatorBase +// derived class type. +message OpDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; + }; + + message Var { + required string parameter = 1; + repeated string arguments = 2; + }; + + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; +}; + +// OpProto describes a C++ framework::OperatorBase derived class. +message OpProto { + + // VarProto describes the C++ type framework::Variable. + message Var { + required string name = 1; + required string comment = 2; + + optional bool duplicable = 3 [ default = false ]; + optional bool intermediate = 4 [ default = false ]; + optional bool not_in_gradient = 5 [ default = false ]; + } + + // AttrProto describes the C++ type Attribute. + message Attr { + required string name = 1; + required AttrType type = 2; + required string comment = 3; + // If that attribute is generated, it means the Paddle third + // language binding has responsibility to fill that + // attribute. End-User should not set that attribute. + optional bool generated = 4 [ default = false ]; + } + + required string type = 1; + repeated Var inputs = 2; + repeated Var outputs = 3; + repeated Attr attrs = 4; + required string comment = 5; +} diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc new file mode 100644 index 0000000000..0a2a41f6b6 --- /dev/null +++ b/paddle/framework/grad_op_builder.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOpArgType::OUT WARRANTIES OR CONDITIONS OF ANY KOpArgType::IND, either +express or implied. See the License for the specific language governing +permissions and limitations under the License. */ + +#include "paddle/framework/grad_op_builder.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace framework { +enum class OpArgType { IN, OUT }; + +static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type, + bool is_grad, OperatorBase::VarNameMap* vars) { + const auto& src_inout = + src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs(); + auto& dst_inout = *vars; + const OpProto* proto = OpRegistry::op_info_map().at(src_op->Type()).proto_; + const auto& src_arg_list = + src_type == OpArgType::IN ? proto->inputs() : proto->outputs(); + for (const auto& arg : src_arg_list) { + if (arg.not_in_gradient() && !is_grad) continue; + const std::string src_name = arg.name(); + std::string dst_name = is_grad ? GradVarName(src_name) : src_name; + dst_inout[dst_name].reserve(src_inout.at(src_name).size()); + for (auto& var_name : src_inout.at(src_name)) { + std::string s = is_grad ? 
GradVarName(var_name) : var_name; + dst_inout[dst_name].emplace_back(s); + } + } +} + +OperatorBase* BuildGradOp(const OperatorBase* op) { + auto it = OpRegistry::op_info_map().find(op->Type()); + PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(), + "'%s' has not been registered.", op->Type()); + PADDLE_ENFORCE(it->second.proto_ != nullptr, "'%s' has no OpProto.", + op->Type()); + std::string grad_op_type = it->second.grad_op_type_; + PADDLE_ENFORCE(!grad_op_type.empty(), "'%s' has no gradient operator.", + op->Type()); + + OperatorBase::VarNameMap inputs; + OperatorBase::VarNameMap outputs; + TransOpArg(op, OpArgType::IN, false, &inputs); // I + TransOpArg(op, OpArgType::OUT, false, &inputs); // O + TransOpArg(op, OpArgType::OUT, true, &inputs); // OG + TransOpArg(op, OpArgType::IN, true, &outputs); // IG + + it = OpRegistry::op_info_map().find(grad_op_type); + PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(), + "'%s' has not been registered.", grad_op_type); + return it->second.creator_(grad_op_type, inputs, outputs, op->Attrs()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/attr_type.proto b/paddle/framework/grad_op_builder.h similarity index 64% rename from paddle/framework/attr_type.proto rename to paddle/framework/grad_op_builder.h index 2d8e0476d7..998f8ebbb5 100644 --- a/paddle/framework/attr_type.proto +++ b/paddle/framework/grad_op_builder.h @@ -12,17 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -syntax="proto2"; -package paddle.framework; - -// Attribute Type for paddle's Op. -// Op contains many attributes. Each type of attributes could be different. -// The AttrType will be shared between AttrDesc and AttrProto. 
-enum AttrType { - INT = 0; - FLOAT = 1; - STRING = 2; - INTS = 3; - FLOATS = 4; - STRINGS = 5; -} \ No newline at end of file +#pragma once + +#include "paddle/framework/operator.h" + +namespace paddle { +namespace framework { + +OperatorBase* BuildGradOp(const OperatorBase* op); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc new file mode 100644 index 0000000000..902c2655e9 --- /dev/null +++ b/paddle/framework/grad_op_builder_test.cc @@ -0,0 +1,122 @@ +#include "paddle/framework/grad_op_builder.h" +#include +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +USE_OP(add_two); + +namespace paddle { +namespace framework { + +class MutiInOutOpMaker : public OpProtoAndCheckerMaker { + public: + MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("In1", "a single input"); + AddInput("In2_mult", "a multiple input").AsDuplicable(); + AddInput("In3", "another single input"); + AddOutput("Out1", "a single output"); + AddOutput("Out2_mult", "a multiple output").AsDuplicable(); + AddComment("test op with multiple inputs and outputs"); + } +}; + +class IOIgnoredOpMaker : public OpProtoAndCheckerMaker { + public: + IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("In1", "a single input"); + AddInput("In2_mult", "a multiple input").AsDuplicable().NotInGradient(); + AddInput("In3_mult", "another multiple input").AsDuplicable(); + AddOutput("Out1_mult", "a multiple output").AsDuplicable(); + AddOutput("Out2", "a single output").NotInGradient(); + AddComment("op with inputs and outputs ignored in gradient calculating"); + } +}; + +} // namespace framework +} // namespace paddle + +namespace f = paddle::framework; + +TEST(GradOpBuilder, AddTwo) { + std::shared_ptr add_op(f::OpRegistry::CreateOp( + "add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {})); + std::shared_ptr grad_add_op = + f::OpRegistry::CreateGradOp(*add_op); + EXPECT_EQ(grad_add_op->Inputs().size(), 4UL); + EXPECT_EQ(grad_add_op->Outputs().size(), 2UL); + EXPECT_EQ(grad_add_op->Input("X"), "x"); + EXPECT_EQ(grad_add_op->Input("Y"), "y"); + EXPECT_EQ(grad_add_op->Input("Out"), "out"); + EXPECT_EQ(grad_add_op->Input(f::GradVarName("Out")), f::GradVarName("out")); + EXPECT_EQ(grad_add_op->Output(f::GradVarName("X")), f::GradVarName("x")); + EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y")); +} + +REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP); +REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP); + +TEST(GradOpBuilder, MutiInOut) { + std::shared_ptr test_op(f::OpRegistry::CreateOp( + "mult_io", {{"In1", {"in1"}}, + {"In2_mult", {"in2_1", "in2_2", "in2_3"}}, + {"In3", {"in3"}}}, + {{"Out1", {"out1"}}, {"Out2_mult", {"out2_1", "out2_2"}}}, {})); + std::shared_ptr grad_test_op = + f::OpRegistry::CreateGradOp(*test_op); + + ASSERT_EQ(grad_test_op->Inputs().size(), 3UL + 2UL + 2UL); + EXPECT_EQ(grad_test_op->Input("In1"), "in1"); + EXPECT_EQ(grad_test_op->Inputs("In2_mult"), + std::vector({"in2_1", "in2_2", "in2_3"})); + EXPECT_EQ(grad_test_op->Input("In3"), "in3"); + EXPECT_EQ(grad_test_op->Input("Out1"), "out1"); + EXPECT_EQ(grad_test_op->Inputs("Out2_mult"), + std::vector({"out2_1", "out2_2"})); + EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out1")), + f::GradVarName("out1")); + 
EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out2_mult")), + std::vector( + {f::GradVarName("out2_1"), f::GradVarName("out2_2")})); + + ASSERT_EQ(grad_test_op->Outputs().size(), 3UL); + EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1")); + EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")), + std::vector({f::GradVarName("in2_1"), + f::GradVarName("in2_2"), + f::GradVarName("in2_3")})); + EXPECT_EQ(grad_test_op->Output(f::GradVarName("In3")), f::GradVarName("in3")); +} + +TEST(GradOpBuilder, IOIgnoredInGradient) { + std::shared_ptr test_op(f::OpRegistry::CreateOp( + "io_ignored", {{"In1", {"in1"}}, + {"In2_mult", {"in2_1", "in2_2"}}, + {"In3_mult", {"in3_1", "in3_2"}}}, + {{"Out1_mult", {"out1_1", "out1_2"}}, {"Out2", {"out2"}}}, {})); + std::shared_ptr grad_test_op = + f::OpRegistry::CreateGradOp(*test_op); + + // 'In2' and 'Out2' are ignored in gradient calculating + ASSERT_EQ(grad_test_op->Inputs().size(), 2UL + 1UL + 2UL); + EXPECT_EQ(grad_test_op->Input("In1"), "in1"); + EXPECT_EQ(grad_test_op->Inputs("In3_mult"), + std::vector({"in3_1", "in3_2"})); + EXPECT_EQ(grad_test_op->Inputs("Out1_mult"), + std::vector({"out1_1", "out1_2"})); + EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")), + std::vector( + {f::GradVarName("out1_1"), f::GradVarName("out1_2")})); + EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")), + f::GradVarName("out2")); + + ASSERT_EQ(grad_test_op->Outputs().size(), 3UL); + EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1")); + EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")), + std::vector( + {f::GradVarName("in2_1"), f::GradVarName("in2_2")})); + EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In3_mult")), + std::vector( + {f::GradVarName("in3_1"), f::GradVarName("in3_2")})); +} diff --git a/paddle/framework/images/duplicate_op.graffle b/paddle/framework/images/duplicate_op.graffle new file mode 100644 index 0000000000..5979f792e2 Binary files /dev/null and b/paddle/framework/images/duplicate_op.graffle differ diff --git a/paddle/framework/images/duplicate_op.png b/paddle/framework/images/duplicate_op.png new file mode 100644 index 0000000000..f299c5d37f Binary files /dev/null and b/paddle/framework/images/duplicate_op.png differ diff --git a/paddle/framework/images/duplicate_op2.graffle b/paddle/framework/images/duplicate_op2.graffle new file mode 100644 index 0000000000..2b658085d6 Binary files /dev/null and b/paddle/framework/images/duplicate_op2.graffle differ diff --git a/paddle/framework/images/duplicate_op2.png b/paddle/framework/images/duplicate_op2.png new file mode 100644 index 0000000000..c5588015d1 Binary files /dev/null and b/paddle/framework/images/duplicate_op2.png differ diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc new file mode 100644 index 0000000000..2b17890774 --- /dev/null +++ b/paddle/framework/lod_tensor.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
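The grad_op_builder tests above pin down the argument mapping that BuildGradOp performs: a gradient op takes the forward op's inputs (I), outputs (O) and output gradients (OG) as its inputs, and emits the input gradients (IG) as its outputs, with GradVarName deriving the gradient variable names. A hedged sketch of that mapping, assuming GradVarName simply appends a gradient suffix such as "@GRAD" (the exact suffix is defined elsewhere in the codebase):

    // Illustrative only; mirrors the GradOpBuilder.AddTwo test above and
    // assumes the add_two op and its gradient are registered (USE_OP(add_two)).
    #include "paddle/framework/op_registry.h"

    void SketchGradOpNaming() {
      namespace f = paddle::framework;
      auto add_op = f::OpRegistry::CreateOp(
          "add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {});
      auto grad_op = f::OpRegistry::CreateGradOp(*add_op);
      // grad_op inputs : "X"->{"x"}, "Y"->{"y"}, "Out"->{"out"}   (I and O)
      //                  GradVarName("Out")->{GradVarName("out")} (OG)
      // grad_op outputs: GradVarName("X")->{GradVarName("x")},    (IG)
      //                  GradVarName("Y")->{GradVarName("y")}
    }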
*/ + +#include "paddle/framework/lod_tensor.h" + +#include + +namespace paddle { +namespace framework { + +LODTensor::LOD LODTensor::LOD::SliceLevels(size_t level_begin, + size_t level_end) const { + LOD new_lod; + new_lod.reserve(level_end - level_begin); + for (size_t i = level_begin; i < level_end; i++) { + new_lod.emplace_back(at(i)); + } + return new_lod; +} + +LODTensor::LOD LODTensor::LOD::SliceInLevel(size_t level, size_t elem_begin, + size_t elem_end) const { + // slice the lod. + LOD new_lod; + new_lod.reserve(size() - level); + auto start = this->at(level)[elem_begin]; + auto end = this->at(level)[elem_end]; + + for (auto it = this->begin() + level; it != this->end(); it++) { + auto it_begin = std::find(it->begin(), it->end(), start); + auto it_end = std::find(it_begin, it->end(), end); + PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info"); + PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info"); + new_lod.emplace_back(it_begin, it_end + 1); + // reset offset if tensor is copyed and sliced. + std::transform(new_lod.back().begin(), new_lod.back().end(), + new_lod.back().begin(), + [start](int v) { return v - start; }); + PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LOD"); + } + PADDLE_ENFORCE_LE(new_lod.size(), this->size()); + return new_lod; +} + +bool operator==(const LODTensor::LOD& a, const LODTensor::LOD& b) { + if (a.size() != b.size()) { + return false; + } + + for (size_t i = 0; i < a.size(); i++) { + const auto& a_level = a[i]; + const auto& b_level = b[i]; + if (a_level.size() != b_level.size()) { + return false; + } + for (size_t j = 0; j < a_level.size(); j++) { + if (a_level[j] != b_level[j]) { + return false; + } + } + } + + return true; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h new file mode 100644 index 0000000000..9e27aec38d --- /dev/null +++ b/paddle/framework/lod_tensor.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#if !defined(PADDLE_ONLY_CPU) +#include +#include +#endif + +#include "paddle/framework/ddim.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace framework { + +/* + * LODTensor (Level of details Tensor) + * see https://en.wikipedia.org/wiki/Level_of_details for reference. + */ +class LODTensor : public Tensor { + public: +// Level save offsets of each unit. +#ifdef PADDLE_ONLY_CPU + template + using Vector = std::vector; +#else + template + using Vector = thrust::host_vector; +#endif + // LoD stores offsets of each level of units, the largest units level first, + // then the smaller units level. Each Level stores the offsets of units in + // Tesor. 
+ class LOD : public std::vector> { + public: + LOD SliceLevels(size_t level_begin, size_t level_end) const; + LOD SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) const; + }; + + LODTensor() {} + explicit LODTensor(const LOD &lod) : lod_(lod) {} + + virtual Tensor *Clone() const { return new LODTensor(lod_); } + + /* + * Get a element from LOD. + */ + size_t lod_element(size_t level, size_t elem) const { + PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, + NumLevels()); + PADDLE_ENFORCE(elem < NumElements(level), + "element begin [%d] out of range [%d]", elem, + NumElements(level)); + return (lod_)[level][elem]; + } + + /* + * Number of LODTensor's levels, each level has units of data, for example, + * in the sentence's view, article, paragraph, sentence are 3 levels. + */ + size_t NumLevels() const { return lod_.size(); } + /* + * Number of elements in a level. + */ + size_t NumElements(size_t level = 0) const { + PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, + NumLevels()); + // the last offset is the end of last element + return lod_[level].size() - 1; + } + + /* + * Slice of levels[level_begin:level_end], with tensor shared. + */ + template + LODTensor SliceLevels(size_t level_begin, size_t level_end) const; + + /* + * Slice of elements of a level, [elem_begin: elem_end], with tensor shared. + * @note: low performance in slice lod_. + */ + template + LODTensor SliceInLevel(size_t level, size_t elem_begin, + size_t elem_end) const; + + /* + * Copy other's lod_'s content, free to mutate. + */ + void CopyLOD(const LODTensor &other) { lod_ = other.lod_; } + /* + * Determine whether LODTensor has a valid LOD info. + */ + const LOD &lod() const { return lod_; } + LOD *mutable_lod() { return &lod_; } + + virtual ~LODTensor() {} + + private: + LOD lod_; +}; + +bool operator==(const LODTensor::LOD &a, const LODTensor::LOD &b); + +template +LODTensor LODTensor::SliceLevels(size_t level_begin, size_t level_end) const { + auto new_lod = lod_.SliceLevels(level_begin, level_end); + // slice levels just need to update LOD info, each level will contains the + // whole tensor_, so no need to modify tensor_. + LODTensor new_tensor(new_lod); + new_tensor.ShareDataWith(*this); + return new_tensor; +} + +template +LODTensor LODTensor::SliceInLevel(size_t level, size_t elem_begin, + size_t elem_end) const { + PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, + NumLevels()); + PADDLE_ENFORCE(elem_begin < NumElements(level), + "element begin [%d] out of range [%d]", elem_begin, + NumElements(level)); + PADDLE_ENFORCE(elem_end < NumElements(level) + 1, + "element end [%d] out of range [%d]", elem_end, + NumElements(level)); + + auto new_lod = lod_.SliceInLevel(level, elem_begin, elem_end); + + // slice elements just need to update LOD info, because offsets are not + // changed, so the original tensor_ can be reused. + LODTensor new_tensor(new_lod); + new_tensor.ShareDataWith(*this); + return new_tensor; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc new file mode 100644 index 0000000000..2881136ced --- /dev/null +++ b/paddle/framework/lod_tensor_test.cc @@ -0,0 +1,116 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
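To make the offset layout in lod_tensor.h above concrete: each LOD level is a monotonically increasing offset vector over the tensor's rows whose last entry is the total row count, so NumElements(level) is that level's size minus one, and SliceLevels/SliceInLevel only rebuild this offset table while the underlying tensor storage is shared rather than copied. A small sketch, illustrative only, with hypothetical offsets:

    // Sketch: a 2-level LOD over 5 rows; 2 sequences at level 0 containing
    // 3 and 2 sub-sequences at level 1.
    #include "paddle/framework/lod_tensor.h"

    void SketchLOD() {
      using paddle::framework::LODTensor;
      LODTensor::LOD lod;
      lod.push_back(std::vector<size_t>{0, 3, 5});           // level 0 offsets
      lod.push_back(std::vector<size_t>{0, 1, 2, 3, 4, 5});  // level 1 offsets
      LODTensor t(lod);
      // t.NumLevels() == 2, t.NumElements(0) == 2, t.NumElements(1) == 5.
      // t.SliceLevels<float>(1, 2) keeps only level 1; data is shared, not copied.
    }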
+ You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "paddle/framework/lod_tensor.h" + +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +class LODTensorTester : public ::testing::Test { + public: + virtual void SetUp() override { + lod_tensor.reset(new LODTensor); + // tensor's batch_size: 30 + // 3 levels + // 0 10 20 + // 0 5 10 15 20 + // 0 2 5 7 10 12 15 20 + LODTensor::LOD lod; + lod.push_back(std::vector{0, 10, 20}); + lod.push_back(std::vector{0, 5, 10, 15, 20}); + lod.push_back(std::vector{0, 2, 5, 7, 10, 12, 15, 17, 20}); + + ASSERT_EQ(lod.size(), 3UL); + + tensor.Resize({20 /*batch size*/, 128 /*dim*/}); + // malloc memory + tensor.mutable_data(place); + + lod_tensor.reset(new LODTensor(lod)); + lod_tensor->Resize({20 /*batch size*/, 128 /*dim*/}); + + lod_tensor->ShareDataWith(tensor); + // lod_tensor->ShareDataWith(tensor); + } + + protected: + std::unique_ptr lod_tensor; + platform::CPUPlace place; + Tensor tensor; +}; + +TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor->NumLevels(), 3UL); } + +TEST_F(LODTensorTester, NumElements) { + ASSERT_EQ(lod_tensor->NumElements(0), 2UL); + ASSERT_EQ(lod_tensor->NumElements(1), 4UL); + ASSERT_EQ(lod_tensor->NumElements(2), 8UL); +} + +TEST_F(LODTensorTester, SliceLevels) { + // slice 1 level + for (size_t level = 0; level < 3UL; ++level) { + auto new_lod_tensor = lod_tensor->SliceLevels(level, level + 1); + ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL); + ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level)); + // ASSERT_EQ(new_lod_tensor, *lod_tensor); + } + // slice 2 level + for (size_t level = 0; level < 2UL; ++level) { + auto new_lod_tensor = lod_tensor->SliceLevels(level, level + 2); + ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level)); + ASSERT_EQ(new_lod_tensor.NumElements(1), + lod_tensor->NumElements(level + 1)); + ASSERT_EQ(new_lod_tensor.data(), lod_tensor->data()); + } +} + +TEST_F(LODTensorTester, SliceInLevel) { + size_t level = 0; + auto new_lod_tensor = lod_tensor->SliceInLevel(level, 0, 2); + EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL); + EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL); + EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL); + EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL); + ASSERT_EQ(new_lod_tensor.data(), lod_tensor->data()); + + level = 1; + new_lod_tensor = lod_tensor->SliceInLevel(level, 0, 2); + ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL); + ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL); + ASSERT_EQ(new_lod_tensor.data(), lod_tensor->data()); +} + +TEST_F(LODTensorTester, ShareLOD) { + LODTensor new_lod_tensor; + new_lod_tensor.CopyLOD(*lod_tensor); + ASSERT_EQ(new_lod_tensor.lod(), lod_tensor->lod()); +} + +TEST_F(LODTensorTester, CopyLOD) { + LODTensor new_lod_tensor; + new_lod_tensor.CopyLOD(*lod_tensor); + bool equals = std::equal(lod_tensor->lod().begin(), lod_tensor->lod().end(), + new_lod_tensor.lod().begin()); + ASSERT_TRUE(equals); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc deleted file 
mode 100644 index 73b3051235..0000000000 --- a/paddle/framework/net.cc +++ /dev/null @@ -1,20 +0,0 @@ -#include "paddle/framework/net.h" - -namespace paddle { -namespace framework { - -PlainNet::PlainNet(const NetDesc& def) {} - -void PlainNet::InferShape(Scope* scope) { - for (auto& op : ops_) { - op.InferShape(); - } -} - -void PlainNet::Run(std::shared_ptr scope, DeviceContext* ctx) { - for (auto& op : ops_) { - op.Run(ctx); - } -} -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/net.h b/paddle/framework/net.h deleted file mode 100644 index 76992e0728..0000000000 --- a/paddle/framework/net.h +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include "paddle/framework/net_proto.pb.h" -#include "paddle/framework/op_proto.pb.h" -#include "paddle/framework/scope.h" -#include "paddle/platform/device_context.h" - -namespace paddle { -namespace framework { -using namespace paddle::platform; - -// operator's index stored in a network. -typedef int OpIndex; -/** - * NOTE following codes are some definitions of unimplemented concepts. - * We write some basic implementation to make Net compilable. These APIs will - * keep updating if the concepts related are implemented. - */ - -struct OpDesc; -struct OpAttrs {}; - -class Operator { - public: - Operator(const OpDesc &def) {} - void InferShape() {} - void Run(DeviceContext *ctx) {} -}; - -/** - * @brief Network that manage the operators it has. - * - * Network is the container and controller of a set of operators, user can build - * a real network from a NetDesc which is a protobuf message and use - * Network.Run() * to run all the operators in the network. - - * A network object knows all Operators belonging to this network. Variables, - * which are inputs and outputs of these operators, are created and managed by a - * hierarchy of Scope objects. - * - * This is the base class of network, all the networks should implement the apis - * it defines. - */ -class Net { - public: - /** - * @brief Infer shapes of all inputs and outputs of operators. - */ - virtual void InferShape(Scope *scope) = 0; - /** - * @brief Run the network. - * - * Run all the operators and return success(true) or not, with all the - * variables are located in `scope`. `context` describes the detail execution - * environment for ops. `begin` and `end` specify the scope of `ops_` to run, - * If no positive indexes are provided, all operators in `ops_` will run. - */ - virtual void Run(std::shared_ptr scope, DeviceContext *ctx) = 0; - - /** - * @brief Add an Operator according to `def`. - */ - virtual OpIndex AddOp(const OpProto &def) = 0; - - /** - * @brief Add optimizer operators acctording to `attrs`. - */ - virtual void AddOptimizerOps(const OpAttrs &attrs) = 0; - - /** - * @brief Add backward operators. - */ - virtual void AddBackwardOps() = 0; - - /** - * @brief Create a network. 
- */ - static std::unique_ptr Create(const NetDesc &def = NetDesc()); - - virtual ~Net() {} -}; - -/** - * @brief a basic implementation of Net. - * - * PlainNet is a very simple Net, it create a list of operators, and run them - * sequentially following the order they added. - */ -class PlainNet : public Net { - public: - /** - * @brief Initialize a PlainNet. - * - * Initialize from a network describe by `def`. NetDesc is the definition of - * a network. - */ - PlainNet(const NetDesc &def); - - /** - * Infer all the operators' input and output varialbes' shapes, will be called - * before every mini-batch - */ - virtual void InferShape(Scope *scope) override; - - /** - * @brief Run the network. - * - * Run all the operators with the `scope`, if no scope is provided, default - * scope will be used instead. If no OpContext is provicded, default context - * will be used. - */ - virtual void Run(std::shared_ptr scope, DeviceContext *ctx) override; - - /** - * @brief Add an operator to this network. - */ - virtual OpIndex AddOp(const OpProto &def) override; - - /** - * @brief Add all optimizer operators related into the network. - */ - virtual void AddOptimizerOps(const OpAttrs &attrs) override; - - /** - * @brief Add all backward operators related into the network. - */ - virtual void AddBackwardOps() override; - - virtual ~PlainNet() override {} - - protected: - /** - * @brief Build the network. - * - * Create operators accordding to `def`, will be called by the constructor. - */ - void BuildNet(const NetDesc &def); - - /** - * @brief Add an operator into this network. - * - * Add a operator which is identified as `type` and has attributes described - * in `attrs`, the `inputs` are the keys of readonly input variables, - * `outputs` are keys of mutable output variables. An `OpIndex` will be - * returned to indicate the offset of the new operator in `ops_`. - */ - OpIndex AddOp(const std::string &type, const std::vector &inputs, - const std::vector &outputs, - const OpAttrs &attrs = OpAttrs()); - - private: - // the operators owned by `Network`. - std::vector ops_; -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/net_proto.proto b/paddle/framework/net_proto.proto deleted file mode 100644 index 0779f49fe2..0000000000 --- a/paddle/framework/net_proto.proto +++ /dev/null @@ -1,15 +0,0 @@ -syntax="proto2"; -package paddle.framework; - -import "op_proto.proto"; - -message NetDesc { - // network identification - optional string name = 1; - // operator contains in network - repeated OpProto operators = 2; - // network type to run with. e.g "plainNet", "DAG" - optional string net_type = 3; - // num worker always - optional int32 num_workers = 4; -} diff --git a/paddle/framework/op_desc.proto b/paddle/framework/op_desc.proto deleted file mode 100644 index 89497f3c16..0000000000 --- a/paddle/framework/op_desc.proto +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -syntax="proto2"; -package paddle.framework; - -import "attr_type.proto"; - -// AttrDesc is used to describe Attributes of an Operator. It contain's -// name, type, and value of Attribute. -// -// e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0 -message AttrDesc { - required string name = 1; - required AttrType type = 2; - optional int32 i = 3; - optional float f = 4; - optional string s = 5; - repeated int32 ints = 6; - repeated float floats = 7; - repeated string strings = 8; -}; - -// Protocol Message to describe an Operator. -// -// In PaddlePaddle, Operator is used to do a certain computation such -// as "add", "sub", "cosine", etc. -// (1) Operator needs to know the input and output variable names. -// (2) Some ops may have special attributes such as "scale" in "CosineOp". -// -// 3rd-party language can build this proto message and call -// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator. -message OpDesc { - // input names of this Operator. - repeated string inputs = 1; - - // output names of this Operator. - repeated string outputs = 2; - - // type of this Operator, such as "add", "sub", "fc". - required string type = 3; - - // Attributes of this Operator. e.g., scale=3.0 in cosine op. - repeated AttrDesc attrs = 4; -}; \ No newline at end of file diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto deleted file mode 100644 index 22df6f9c6b..0000000000 --- a/paddle/framework/op_proto.proto +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// Protocol Message for 3rd-party language binding. -// -// Paddle Python package will use `OpProto` to generate op creation methods. -// The op creation methods take user's input and generate `OpDesc` proto message, -// then pass `OpDesc` to C++ side and create Op pointer. -// -syntax="proto2"; -package paddle.framework; - -import "attr_type.proto"; - -// Attribute protocol message for 3rd-party language binding. -// It will store the Op support what attribute and what type. -message AttrProto { - // Supported attribute name. e.g. `scale` for cosine op. - required string name = 1; - - // Supported attribute type. - required AttrType type = 2; - - // Supported attribute comments. It helps 3rd-party language generate doc-string. - required string comment = 3; -} - -// Input or output message for 3rd-party language binding. -// It contains parameter name and its comments. -message VarProto { - // Input or output name in that op creation function. - // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names. - required string name = 1; - - // The comment for that input. It helps 3rd-party language generate doc-string. - required string comment = 2; -} - -// Op protocol message for 3rd-party language binding. -// It contains all information for generating op creation method. -message OpProto { - // The input information to generate op creation method. 
- repeated VarProto inputs = 1; - - // The output information to generate op creation method. - repeated VarProto outputs = 2; - - // The attribute information to generate op creation method. - repeated AttrProto attrs = 3; - - // The comments for that Op. It helps 3rd-party language generate - // doc-string. The whole documentation of that Op is generated by comment, - // inputs, outputs, attrs together. - required string comment = 4; - - // The type of that Op. - required string type = 5; -} diff --git a/paddle/framework/op_proto_test.cc b/paddle/framework/op_proto_test.cc deleted file mode 100644 index 9c054bde44..0000000000 --- a/paddle/framework/op_proto_test.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include -#include - -TEST(TestOpProto, ALL) { - paddle::framework::OpProto proto; - { - auto ipt = proto.mutable_inputs()->Add(); - *ipt->mutable_name() = "a"; - *ipt->mutable_comment() = "the one input of cosine op"; - } - { - auto ipt = proto.mutable_inputs()->Add(); - *ipt->mutable_name() = "b"; - *ipt->mutable_comment() = "the other input of cosine op"; - } - { - auto opt = proto.mutable_outputs()->Add(); - *opt->mutable_name() = "output"; - *opt->mutable_comment() = "the output of cosine op"; - } - { - auto attr = proto.mutable_attrs()->Add(); - *attr->mutable_name() = "scale"; - attr->set_type(paddle::framework::AttrType::FLOAT); - *attr->mutable_comment() = "the scale attribute of cosine op"; - } - proto.set_type("cos"); - *proto.mutable_comment() = "cosine op, output = scale * cos(a, b)"; - - ASSERT_TRUE(proto.IsInitialized()); -} \ No newline at end of file diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index 4b35e04e68..8eae86e960 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -1,36 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
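The deleted op_desc.proto and op_proto.proto above are subsumed by framework.proto earlier in this patch: inputs and outputs are no longer flat repeated strings but named Var entries mapping a parameter name to a list of argument names, which is what makes duplicable inputs possible. A sketch of filling the new-style OpDesc from C++, illustrative only (the BuildVar helper in op_registry_test.cc below does the same thing):

    // Sketch: building a framework.proto OpDesc for `cos_sim` with input slot
    // "input" bound to variable "aa", output slot "output" bound to "bb",
    // and a float attribute.
    #include "paddle/framework/framework.pb.h"

    paddle::framework::OpDesc MakeCosSimDesc() {
      paddle::framework::OpDesc desc;
      desc.set_type("cos_sim");
      auto* in = desc.add_inputs();
      in->set_parameter("input");
      in->add_arguments("aa");
      auto* out = desc.add_outputs();
      out->set_parameter("output");
      out->add_arguments("bb");
      auto* attr = desc.add_attrs();
      attr->set_name("scale");
      attr->set_type(paddle::framework::AttrType::FLOAT);
      attr->set_f(3.3f);
      return desc;
    }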
*/ + #include +#include + namespace paddle { namespace framework { -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::INT); +std::unique_ptr OpRegistry::CreateOp(const std::string& type, + const VarNameMap& inputs, + const VarNameMap& outputs, + AttributeMap attrs) { + auto it = op_info_map().find(type); + PADDLE_ENFORCE(it != op_info_map().end(), + "Operator '%s' has not been registered.", type); + it->second.checker_->Check(attrs); + auto op = it->second.creator_(type, inputs, outputs, attrs); + return std::unique_ptr(op); } -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::FLOAT); -} +std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { + VarNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); + VarNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); + AttributeMap attrs; + for (auto& attr : op_desc.attrs()) { + attrs[attr.name()] = GetAttrValue(attr); + } -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::STRING); + return CreateOp(op_desc.type(), inputs, outputs, attrs); } -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::INTS); +OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap( + const google::protobuf::RepeatedPtrField& op_desc_vars) { + VarNameMap ret_val; + for (auto& var : op_desc_vars) { + auto& var_names = ret_val[var.parameter()]; + auto& var_names_in_proto = var.arguments(); + var_names.reserve(static_cast(var_names_in_proto.size())); + std::copy(var_names_in_proto.begin(), var_names_in_proto.end(), + std::back_inserter(var_names)); + } + return ret_val; } -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::FLOATS); +std::unique_ptr OpRegistry::CreateGradOp(const OperatorBase& op) { + PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops"); + return std::unique_ptr(BuildGradOp(&op)); } -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::STRINGS); -} } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index e46da822c6..4c2d13d639 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -1,160 +1,230 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
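As the new op_registry.cc above shows, OpRegistry::CreateOp looks up the OpInfo entry for the type, runs its OpAttrChecker over the attribute map (rejecting illegal values and filling in defaults), and only then invokes the stored creator. A short sketch of the observable behaviour, illustrative only and assuming an op like the cos_sim registered in op_registry_test.cc below:

    // Sketch: defaults are injected by the attribute checker during CreateOp,
    // so an OpDesc without "scale" still yields GetAttr<float>("scale") == 1.0.
    #include "paddle/framework/op_registry.h"

    void SketchDefaultAttr(const paddle::framework::OpDesc& desc_without_scale) {
      auto op = paddle::framework::OpRegistry::CreateOp(desc_without_scale);
      float scale = op->GetAttr<float>("scale");  // default set via SetDefault(1.0)
      (void)scale;
    }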
*/ + #pragma once #include -#include "paddle/framework/attr_checker.h" -#include "paddle/framework/op_desc.pb.h" -#include "paddle/framework/op_proto.pb.h" +#include +#include +#include +#include +#include +#include "paddle/framework/attribute.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/grad_op_builder.h" #include "paddle/framework/operator.h" +#include "paddle/framework/scope.h" namespace paddle { namespace framework { -// helper class to set attribute type -struct AttrTypeHelper { - template - static void SetAttrType(AttrProto* attr); - - static Attribute GetAttrValue(const AttrDesc& attr_desc) { - switch (attr_desc.type()) { - case paddle::framework::AttrType::INT: { - return attr_desc.i(); - } - case paddle::framework::AttrType::FLOAT: { - return attr_desc.f(); - } - case paddle::framework::AttrType::STRING: { - return attr_desc.s(); - } - case paddle::framework::AttrType::INTS: { - std::vector val(attr_desc.ints_size()); - for (int i = 0; i < attr_desc.ints_size(); ++i) { - val[i] = attr_desc.ints(i); - } - return val; - } - case paddle::framework::AttrType::FLOATS: { - std::vector val(attr_desc.floats_size()); - for (int i = 0; i < attr_desc.floats_size(); ++i) { - val[i] = attr_desc.floats(i); - } - return val; - } - case paddle::framework::AttrType::STRINGS: { - std::vector val(attr_desc.strings_size()); - for (int i = 0; i < attr_desc.strings_size(); ++i) { - val[i] = attr_desc.strings(i); - } - return val; - } - } - PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !"); - return boost::blank(); - } -}; +class OpRegistry { + using VarNameMap = OperatorBase::VarNameMap; + using OpCreator = std::function; -// this class not only make proto but also init attribute checkers. -class OpProtoAndCheckerMaker { public: - OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) - : proto_(proto), op_checker_(op_checker) {} - - protected: - void AddInput(const std::string& name, const std::string& comment) { - auto input = proto_->mutable_inputs()->Add(); - *input->mutable_name() = name; - *input->mutable_comment() = comment; - } + struct OpInfo { + OpCreator creator_; + std::string grad_op_type_; + OpProto* proto_; + OpAttrChecker* checker_; + }; - void AddOutput(const std::string& name, const std::string& comment) { - auto output = proto_->mutable_outputs()->Add(); - *output->mutable_name() = name; - *output->mutable_comment() = comment; + template + static void RegisterOp(const std::string& op_type, + const std::string& grad_op_type) { + PADDLE_ENFORCE(op_info_map().count(op_type) == 0, + "'%s' is registered more than once.", op_type); + OpInfo op_info; + op_info.creator_ = [](const std::string& type, const VarNameMap& inputs, + const VarNameMap& outputs, + const AttributeMap& attrs) { + return new OpType(type, inputs, outputs, attrs); + }; + op_info.grad_op_type_ = grad_op_type; + if (std::type_index(typeid(ProtoMakerType)) != + std::type_index(typeid(NOPMaker))) { + op_info.proto_ = new OpProto; + op_info.checker_ = new OpAttrChecker; + auto maker = ProtoMakerType(op_info.proto_, op_info.checker_); + maker.Validate(); + op_info.proto_->set_type(op_type); + PADDLE_ENFORCE( + op_info.proto_->IsInitialized(), + "Fail to initialize %s's OpProto, because %s is not initialized", + op_type, op_info.proto_->InitializationErrorString()); + } else { + op_info.proto_ = nullptr; + op_info.checker_ = nullptr; + } + op_info_map().insert(std::make_pair(op_type, op_info)); + // register gradient op + if (!grad_op_type.empty()) { + RegisterOp(grad_op_type, 
""); + } } - template - TypedAttrChecker& AddAttr(const std::string& name, - const std::string& comment) { - auto attr = proto_->mutable_attrs()->Add(); - *attr->mutable_name() = name; - *attr->mutable_comment() = comment; - AttrTypeHelper::SetAttrType(attr); - return op_checker_->AddAttrChecker(name); - } + static std::unique_ptr CreateOp(const std::string& type, + const VarNameMap& inputs, + const VarNameMap& outputs, + AttributeMap attrs); - void AddType(const std::string& op_type) { proto_->set_type(op_type); } + static std::unique_ptr CreateOp(const OpDesc& op_desc); - void AddComment(const std::string& comment) { - *(proto_->mutable_comment()) = comment; - } + static VarNameMap ConvertOpDescVarsToVarNameMap( + const google::protobuf::RepeatedPtrField& op_desc_vars); + + static std::unique_ptr CreateGradOp(const OperatorBase& op); - OpProto* proto_; - OpAttrChecker* op_checker_; + static std::unordered_map& op_info_map() { + static std::unordered_map op_info_map_; + return op_info_map_; + } }; -class OpRegistry { - using OpCreator = std::function; +class Registrar { + public: + // In our design, various kinds of classes, e.g., operators and kernels, + // have their corresponding registry and registrar. The action of + // registration is in the constructor of a global registrar variable, which, + // however, are not used in the code that calls package framework, and would + // be removed from the generated binary file by the linker. To avoid such + // removal, we add Touch to all registrar classes and make USE_OP macros to + // call this method. So, as long as the callee code calls USE_OP, the global + // registrar variable won't be removed by the linker. + void Touch() {} +}; +template +class OpRegistrar : public Registrar { public: - template - static void RegisterOp(const std::string& op_type) { - creators()[op_type] = [] { return new OpType; }; - OpProto& op_proto = protos()[op_type]; - OpAttrChecker& op_checker = op_checkers()[op_type]; - ProtoMakerType(&op_proto, &op_checker); - PADDLE_ENFORCE(op_proto.IsInitialized(), - "Fail to initialize %s's OpProto !", op_type); + explicit OpRegistrar(const char* op_type) { OpRegistrar(op_type, ""); } + OpRegistrar(const char* op_type, const char* grad_op_type) { + OpRegistry::RegisterOp(op_type, + grad_op_type); } +}; - static OperatorBase* CreateOp(const OpDesc& op_desc) { - std::string op_type = op_desc.type(); - OperatorBase* op = creators().at(op_type)(); - op->desc_ = op_desc; - op->inputs_.reserve((size_t)op_desc.inputs_size()); - std::copy(op_desc.inputs().begin(), op_desc.inputs().end(), - std::back_inserter(op->inputs_)); - op->outputs_.reserve((size_t)op_desc.outputs_size()); - std::copy(op_desc.outputs().begin(), op_desc.outputs().end(), - std::back_inserter(op->outputs_)); - for (auto& attr : op_desc.attrs()) { - op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr); - } - op_checkers().at(op_type).Check(op->attrs_); - op->Init(); - return op; +template +class OpKernelRegistrar : public Registrar { + public: + explicit OpKernelRegistrar(const char* op_type) { + OperatorWithKernel::OpKernelKey key; + key.place_ = PlaceType(); + OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KernelType); } +}; - private: - static std::unordered_map& creators() { - static std::unordered_map creators_; - return creators_; +/** + * check if MACRO is used in GLOBAL NAMESPACE. 
+ */ +#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +/** + * Macro to register Operator. + */ +#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \ + class _OpClass_##op_type##_ : public op_class { \ + public: \ + DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_); \ + DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class); \ + }; \ + class _OpGradClass_##op_type##_ : public grad_op_class { \ + public: \ + DEFINE_OP_CLONE_METHOD(_OpGradClass_##op_type##_); \ + DEFINE_OP_CONSTRUCTOR(_OpGradClass_##op_type##_, grad_op_class); \ + }; \ + static ::paddle::framework::OpRegistrar< \ + _OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_> \ + __op_registrar_##op_type##__(#op_type, #grad_op_type); \ + int TouchOpRegistrar_##op_type() { \ + __op_registrar_##op_type##__.Touch(); \ + return 0; \ } - static std::unordered_map& protos() { - static std::unordered_map protos_; - return protos_; - }; - - static std::unordered_map& op_checkers() { - static std::unordered_map op_checkers_; - return op_checkers_; - }; -}; - -template -class OpRegisterHelper { - public: - OpRegisterHelper(std::string op_type) { - OpRegistry::RegisterOp(op_type); +#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ + REGISTER_OP(op_type, op_class, op_maker_class, , ::paddle::framework::NOP) + +/** + * Macro to register OperatorKernel. + */ +#define REGISTER_OP_KERNEL(op_type, DEVICE_TYPE, place_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##DEVICE_TYPE##__, \ + "REGISTER_OP_KERNEL must be called in global namespace"); \ + static ::paddle::framework::OpKernelRegistrar \ + __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type); \ + int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() { \ + __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch(); \ + return 0; \ } -}; -#define REGISTER_OP(type, op_class, op_maker_class) \ - class op_class##Register { \ - private: \ - const static OpRegisterHelper reg; \ - }; \ - const OpRegisterHelper op_class##Register::reg( \ - #type) +#define REGISTER_OP_GPU_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__) + +#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) + +/** + * Macro to mark what Operator and Kernel + * we will use and tell the compiler to + * link them into target. 
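The Registrar::Touch comment above and the macros around it work together: REGISTER_OP instantiates a global OpRegistrar whose constructor populates OpRegistry::op_info_map(), and the USE_OP family below emits a call to the generated TouchOpRegistrar_* function so the linker cannot strip that global out of a static library. A sketch of how an operator library and its user would employ them, illustrative only (MyOp, MyOpMaker and MyOpKernel are hypothetical names):

    // In the library that defines the operator (my_op.cc), assuming MyOp,
    // MyOpMaker and MyOpKernel are already written:
    REGISTER_OP_WITHOUT_GRADIENT(my_op, MyOp, MyOpMaker);
    REGISTER_OP_CPU_KERNEL(my_op, MyOpKernel);

    // In any translation unit that needs the op linked in (e.g. a test):
    USE_OP(my_op);  // expands to USE_OP_ITSELF + USE_OP_KERNEL, forcing the
                    // registrar globals above to be kept by the linker.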
+ */ +#define USE_OP_ITSELF(op_type) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_itself_##op_type, \ + "USE_OP_ITSELF must be called in global namespace"); \ + extern int TouchOpRegistrar_##op_type(); \ + static int use_op_itself_##op_type##_ __attribute__((unused)) = \ + TouchOpRegistrar_##op_type() + +#define USE_OP_DEVICE_KERNEL(op_type, DEVICE_TYPE) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_kernel_##op_type##_##DEVICE_TYPE##__, \ + "USE_OP_DEVICE_KERNEL must be in global namespace"); \ + extern int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE(); \ + static int use_op_kernel_##op_type##_##DEVICE_TYPE##_ \ + __attribute__((unused)) = \ + TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() + +// TODO(fengjiayi): The following macros +// seems ugly, do we have better method? + +#ifdef PADDLE_ONLY_CPU +#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU) +#else +#define USE_OP_KERNEL(op_type) \ + USE_OP_DEVICE_KERNEL(op_type, CPU); \ + USE_OP_DEVICE_KERNEL(op_type, GPU) +#endif + +#define USE_CPU_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CPU); + +#define USE_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_KERNEL(op_type) } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index f5162fb870..50c45919c5 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -1,15 +1,16 @@ #include "paddle/framework/op_registry.h" #include -using namespace paddle::framework; +namespace pd = paddle::framework; namespace paddle { namespace framework { class CosineOp : public OperatorBase { public: - void Run(const std::shared_ptr& scope, + using OperatorBase::OperatorBase; + void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const override {} - void InferShape(const std::shared_ptr& scope) const override {} + void InferShape(const Scope& scope) const override {} }; class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -21,47 +22,53 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .LargerThan(0.0); - AddType("cos"); AddComment("This is cos op"); } }; -REGISTER_OP(cos_sim, CosineOp, CosineOpProtoAndCheckerMaker); - class MyTestOp : public OperatorBase { public: - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + using OperatorBase::OperatorBase; + void InferShape(const Scope& scope) const override {} + void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const override {} - - public: }; class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of cosine op"); - AddOutput("output", "output of cosine op"); + AddInput("input", "input of cosine op").AsDuplicable(); + AddOutput("output", "output of cosine op").AsIntermediate(); auto my_checker = [](int i) { PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); }; AddAttr("test_attr", "a simple test attribute") .AddCustomChecker(my_checker); - AddType("my_test_op"); AddComment("This is my_test op"); } }; - -REGISTER_OP(my_test_op, MyTestOp, MyTestOpProtoAndCheckerMaker); } // namespace framework } // namespace paddle +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + 
paddle::framework::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + var->add_arguments(arg_name); + } +} +REGISTER_OP_WITHOUT_GRADIENT(cos_sim, paddle::framework::CosineOp, + paddle::framework::CosineOpProtoAndCheckerMaker); +REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp, + paddle::framework::MyTestOpProtoAndCheckerMaker); + TEST(OpRegistry, CreateOp) { paddle::framework::OpDesc op_desc; op_desc.set_type("cos_sim"); - op_desc.add_inputs("aa"); - op_desc.add_outputs("bb"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); float scale = 3.3; auto attr = op_desc.mutable_attrs()->Add(); @@ -69,9 +76,8 @@ TEST(OpRegistry, CreateOp) { attr->set_type(paddle::framework::AttrType::FLOAT); attr->set_f(scale); - paddle::framework::OperatorBase* op = - paddle::framework::OpRegistry::CreateOp(op_desc); - auto scope = std::make_shared(); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::Scope scope; paddle::platform::CPUDeviceContext dev_ctx; op->Run(scope, dev_ctx); float scale_get = op->GetAttr("scale"); @@ -81,8 +87,8 @@ TEST(OpRegistry, CreateOp) { TEST(OpRegistry, IllegalAttr) { paddle::framework::OpDesc op_desc; op_desc.set_type("cos_sim"); - op_desc.add_inputs("aa"); - op_desc.add_outputs("bb"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); @@ -91,9 +97,8 @@ TEST(OpRegistry, IllegalAttr) { bool caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = - paddle::framework::OpRegistry::CreateOp(op_desc); - } catch (paddle::framework::EnforceNotMet err) { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "larger_than check fail"; const char* err_msg = err.what(); @@ -107,14 +112,13 @@ TEST(OpRegistry, IllegalAttr) { TEST(OpRegistry, DefaultValue) { paddle::framework::OpDesc op_desc; op_desc.set_type("cos_sim"); - op_desc.add_inputs("aa"); - op_desc.add_outputs("bb"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); ASSERT_TRUE(op_desc.IsInitialized()); - paddle::framework::OperatorBase* op = - paddle::framework::OpRegistry::CreateOp(op_desc); - auto scope = std::make_shared(); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::Scope scope; paddle::platform::CPUDeviceContext dev_ctx; op->Run(scope, dev_ctx); ASSERT_EQ(op->GetAttr("scale"), 1.0); @@ -123,15 +127,14 @@ TEST(OpRegistry, DefaultValue) { TEST(OpRegistry, CustomChecker) { paddle::framework::OpDesc op_desc; op_desc.set_type("my_test_op"); - op_desc.add_inputs("ii"); - op_desc.add_outputs("oo"); + BuildVar("input", {"ii"}, op_desc.add_inputs()); + BuildVar("output", {"oo"}, op_desc.add_outputs()); // attr 'test_attr' is not set bool caught = false; try { - paddle::framework::OperatorBase* op __attribute__((unused)) = - paddle::framework::OpRegistry::CreateOp(op_desc); - } catch (paddle::framework::EnforceNotMet err) { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "Attribute 'test_attr' is required!"; const char* err_msg = err.what(); @@ -148,9 +151,8 @@ TEST(OpRegistry, CustomChecker) { attr->set_i(3); caught = false; try { - paddle::framework::OperatorBase* op 
__attribute__((unused)) = - paddle::framework::OpRegistry::CreateOp(op_desc); - } catch (paddle::framework::EnforceNotMet err) { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "'test_attr' must be even!"; const char* err_msg = err.what(); @@ -166,16 +168,42 @@ TEST(OpRegistry, CustomChecker) { attr->set_name("test_attr"); attr->set_type(paddle::framework::AttrType::INT); attr->set_i(4); - paddle::framework::OperatorBase* op = - paddle::framework::OpRegistry::CreateOp(op_desc); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::platform::CPUDeviceContext dev_ctx; - auto scope = std::make_shared(); + paddle::framework::Scope scope; op->Run(scope, dev_ctx); int test_attr = op->GetAttr("test_attr"); ASSERT_EQ(test_attr, 4); } -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} \ No newline at end of file +class TestAttrProtoMaker : public pd::OpProtoAndCheckerMaker { + public: + TestAttrProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("scale", "scale of test op"); + AddAttr("scale", "scale of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedAttr) { + pd::OpProto op_proto; + pd::OpAttrChecker op_checker; + auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} + +class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker { + public: + TestInOutProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddInput("input", "input of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedInOut) { + pd::OpProto op_proto; + pd::OpAttrChecker op_checker; + auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 8f7adff8b3..eadd8f3316 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -13,31 +13,194 @@ See the License for the specific language governing permissions and limitations under the License. 
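The maker classes in op_registry_test.cc above illustrate the OpProtoAndCheckerMaker API that replaces the old AddType/flat-proto style: inputs and outputs get per-slot modifiers, attributes get typed checkers, and Validate() rejects duplicated names. A condensed sketch, illustrative only (the op and slot names are hypothetical):

    // Sketch of a proto-and-checker maker in the new style.
    class ScaleLikeOpMaker : public paddle::framework::OpProtoAndCheckerMaker {
     public:
      ScaleLikeOpMaker(paddle::framework::OpProto* proto,
                       paddle::framework::OpAttrChecker* op_checker)
          : OpProtoAndCheckerMaker(proto, op_checker) {
        AddInput("X", "inputs to be scaled").AsDuplicable();
        AddOutput("Out", "scaled results").AsIntermediate();
        AddAttr<float>("scale", "scaling factor")
            .SetDefault(1.0)
            .LargerThan(0.0);
        AddComment("Illustrative op: Out = scale * X");
      }
    };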
*/ #include "paddle/framework/operator.h" +#include +#include "paddle/framework/op_registry.h" namespace paddle { namespace framework { +template <> +Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< + platform::CPUPlace, Eigen::DefaultDevice>() const { + return *device_context_->get_eigen_device(); +} + +#ifndef PADDLE_ONLY_CPU +template <> +Eigen::GpuDevice& +ExecutionContext::GetEigenDevice() const { + return *device_context_->get_eigen_device(); +} +#endif + +const std::string& OperatorBase::Input(const std::string& name) const { + auto& ins = Inputs(name); + PADDLE_ENFORCE_EQ(ins.size(), 1UL, + "Op %s input %s should contain only one variable", type_, + name); + return ins[0]; +} + +const std::vector& OperatorBase::Inputs( + const std::string& name) const { + auto it = inputs_.find(name); + PADDLE_ENFORCE(it != inputs_.end(), "Op %s do not have input %s", type_, + name); + return it->second; +} + +const std::string& OperatorBase::Output(const std::string& name) const { + auto& outs = Outputs(name); + PADDLE_ENFORCE_EQ(outs.size(), 1UL, + "Op %s output %s should contain only one variable", type_, + name); + return outs[0]; +} + +const std::vector& OperatorBase::Outputs( + const std::string& name) const { + auto it = outputs_.find(name); + PADDLE_ENFORCE(it != outputs_.end(), "Op %s does not have output %s", type_, + name); + return it->second; +} + std::string OperatorBase::DebugString() const { std::stringstream ss; - ss << "=================\n"; - ss << "type = " << desc_.type() << "\n"; - ss << "inputs = ["; - for (auto& ipt : inputs_) { - ss << ipt << ", "; - } - ss << "]\n"; - ss << "outputs = ["; - for (auto& opt : outputs_) { - ss << opt << ", "; - } - ss << "]\n"; - ss << "attr_keys = ["; - for (auto& attr : attrs_) { - ss << attr.first << ", "; - } - ss << "]\n"; + ss << "Op(" << type_ << "), inputs:{"; + for (auto it = inputs_.begin(); it != inputs_.end();) { + auto& input = *it; + ss << input.first << "["; + for (size_t i = 0; i < input.second.size(); ++i) { + ss << input.second[i]; + if (i != input.second.size() - 1) { + ss << ", "; + } + } + ss << "]"; + ++it; + if (it != inputs_.end()) { + ss << ", "; + } + } + ss << "}, outputs:{"; + for (auto it = outputs_.begin(); it != outputs_.end();) { + auto& output = *it; + ss << output.first << "["; + for (size_t i = 0; i < output.second.size(); ++i) { + ss << output.second[i]; + if (i != output.second.size() - 1) { + ss << ", "; + } + } + ss << "]"; + ++it; + if (it != outputs_.end()) { + ss << ", "; + } + } + ss << "}."; return ss.str(); } +void OperatorBase::Rename(const std::string& old_name, + const std::string& new_name) { + for (auto& input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + for (auto& output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } +} + +OperatorBase::OperatorBase(const std::string& type, + const OperatorBase::VarNameMap& inputs, + const OperatorBase::VarNameMap& outputs, + const AttributeMap& attrs) + : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) { + static std::atomic gUniqId(0UL); + for (auto& output : outputs_) { + for (auto& output_name : output.second) { + if (output_name == kTempVarName) { + output_name += type_; + output_name += "@"; + output_name += std::to_string(gUniqId.fetch_add(1)); + } + } + } +} + +std::vector OperatorBase::OutputVars(bool has_intermediate) const { + std::vector ret_val; + if (has_intermediate) { + // push all outputs into ret_val + for (auto& o : 
outputs_) { + ret_val.reserve(ret_val.size() + o.second.size()); + ret_val.insert(ret_val.end(), o.second.begin(), o.second.end()); + } + return ret_val; + } + auto it = OpRegistry::op_info_map().find(type_); + PADDLE_ENFORCE( + it != OpRegistry::op_info_map().end(), + "Operator %s not registered, cannot figure out intermediate outputs", + type_); + PADDLE_ENFORCE( + it->second.proto_ != nullptr, + "Operator %s has no OpProto, cannot figure out intermediate outputs", + type_); + + // get all OpProto::Var for outputs + for (auto& o : it->second.proto_->outputs()) { + // ignore all intermediate output + if (o.intermediate()) continue; + auto out = outputs_.find(o.name()); + if (out != outputs_.end()) { + ret_val.reserve(ret_val.size() + out->second.size()); + ret_val.insert(ret_val.end(), out->second.begin(), out->second.end()); + } + } + return ret_val; +} + +void OpProtoAndCheckerMaker::Validate() { + validated_ = true; + CheckNoDuplicatedInOutAttrs(); +} + +OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput( + const std::string& name, const std::string& comment) { + auto* input = proto_->add_inputs(); + input->set_name(name); + input->set_comment(comment); + return OpProtoAndCheckerMaker::VariableBuilder{input}; +} + +OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput( + const std::string& name, const std::string& comment) { + auto* output = proto_->add_outputs(); + output->set_name(name); + output->set_comment(comment); + return OpProtoAndCheckerMaker::VariableBuilder{output}; +} + +void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { + std::unordered_set<std::string> names; + auto checker = [&](const std::string& name) { + PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name); + names.insert(name); + }; + for (auto& attr : proto_->attrs()) { + checker(attr.name()); + } + for (auto& input : proto_->inputs()) { + checker(input.name()); + } + for (auto& output : proto_->outputs()) { + checker(output.name()); + } +} + } // namespace framework -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 4336115670..8072980889 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -14,21 +14,45 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include -#include -#include -#include +#include #include #include #include +#include "paddle/framework/attribute.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" +#include "paddle/platform/variant.h" +#include "paddle/utils/Error.h" + namespace paddle { namespace framework { +/// If a variable is an empty variable, that name will be used. +constexpr char kEmptyVarName[] = "@EMPTY@"; + +/// If a variable is a temporary variable, that name will be set in Python, +/// but it will be converted to a unique name in scope after OpCreator. +constexpr char kTempVarName[] = "@TEMP@"; + +/// If a variable's name has a certain suffix, it means that the +/// variable is the gradient of another variable. +/// e.g. Variable "x@GRAD" is the gradient of variable "x". +constexpr char kGradVarSuffix[] = "@GRAD"; + +/// Variables with this suffix are supposed to be filled up with zeros.
+constexpr char kZeroVarSuffix[] = "@ZERO"; + +inline std::string GradVarName(const std::string& var_name) { + return var_name + kGradVarSuffix; +} + class OperatorBase; +class InferShapeContext; +class ExecutionContext; /** * OperatorBase has the basic element that Net will call to do computation. @@ -38,6 +62,11 @@ class OperatorBase; */ class OperatorBase { public: + using VarNameMap = std::map<std::string, std::vector<std::string>>; + + OperatorBase(const std::string& type, const VarNameMap& inputs, + const VarNameMap& outputs, const AttributeMap& attrs); + virtual ~OperatorBase() {} template @@ -47,58 +76,290 @@ class OperatorBase { return boost::get(attrs_.at(name)); } - std::string DebugString() const; - - /// Init will be called after CreateOperator, you can put some initialization - /// logic here. - virtual void Init() {} + virtual std::string DebugString() const; /// InferShape infer the size of Variables used by this Operator with /// information inside scope - virtual void InferShape(const std::shared_ptr& scope) const = 0; + virtual void InferShape(const Scope& scope) const = 0; /// Net will call this function to Run an op. - virtual void Run(const std::shared_ptr& scope, + virtual void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const = 0; + virtual bool IsNetOp() const { return false; } + + virtual bool SupportGPU() const { return false; } + + /// Rename a variable name in both inputs and outputs. + void Rename(const std::string& old_name, const std::string& new_name); + + const VarNameMap& Inputs() const { return inputs_; } + const VarNameMap& Outputs() const { return outputs_; } + //! Get an input with argument's name described in `op_proto` + const std::string& Input(const std::string& name) const; + //! Get an input which has multiple variables. + const std::vector<std::string>& Inputs(const std::string& name) const; + + //! Get an output with argument's name described in `op_proto` + const std::string& Output(const std::string& name) const; + //! Get an output which has multiple variables. + //! TODO add a vector_view to prevent memory copy. + const std::vector<std::string>& Outputs(const std::string& name) const; + + virtual std::vector<std::string> OutputVars(bool has_intermediate) const; + + const std::string& Type() const { return type_; } + void SetType(const std::string& type) { type_ = type; } + const AttributeMap& Attrs() const { return attrs_; } + + // Return a new operator instance, which is the same as this one. + // Use unique_ptr to prevent the caller from forgetting to delete the returned pointer. + virtual std::unique_ptr<OperatorBase> Clone() const = 0; + protected: - std::string Type() const { return desc_.type(); } + std::string type_; + // NOTE: in case of OpGrad, inputs_ contains: + // I (Inputs) + // O (Outputs) + // OG (Output Gradients) + VarNameMap inputs_; - public: - OpDesc desc_; - std::vector inputs_; - std::vector outputs_; + // NOTE: in case of OpGrad, outputs_ contains: + // IG (Inputs Gradients) + VarNameMap outputs_; AttributeMap attrs_; }; -class OpKernel { +// Macro to define a clone method. +// If you are writing a kernel operator, `Clone` will be defined when you +// register it, i.e. the `Clone` method does not need to be defined by yourself. +#define DEFINE_OP_CLONE_METHOD(CLS) \ + std::unique_ptr<OperatorBase> Clone() const final { \ + return std::unique_ptr<OperatorBase>(new CLS(*this)); \ + } + +// Macro to define a default constructor for Operator. +// You can also use +// using PARENT_CLASS::PARENT_CLASS; +// to use parent's constructor. 
+#define DEFINE_OP_CONSTRUCTOR(CLS, PARENT_CLS) \ + CLS(const std::string& type, const VarNameMap& inputs, \ + const VarNameMap& outputs, const paddle::framework::AttributeMap& attrs) \ + : PARENT_CLS(type, inputs, outputs, attrs) {} + +class NOP : public OperatorBase { public: - /** - * KernelContext is the only parameter of Kernel Run function. - * Run will get input/output variables, state such as momentum and - * device resource such as CUDA stream, cublas handle, etc. from - * KernelContext. User should construct it before run the Operator. - */ - class KernelContext { - public: - KernelContext(const OperatorBase* op, const std::shared_ptr& scope, - const platform::DeviceContext& device_context) - : op_(*op), scope_(scope), device_context_(device_context) {} - - const Variable* Input(int index) const { - return scope_->GetVariable(op_.inputs_[index]); + using OperatorBase::OperatorBase; + void InferShape(const Scope& scope) const override {} + void Run(const Scope& scope, + const platform::DeviceContext& dev_ctx) const override {} + std::unique_ptr Clone() const override { + return std::unique_ptr(new NOP(*this)); + } +}; + +// this class not only make proto but also init attribute checkers. +class OpProtoAndCheckerMaker { + public: + OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : proto_(proto), op_checker_(op_checker) {} + + ~OpProtoAndCheckerMaker() { + PADDLE_ENFORCE(validated_, "should call Validate after build"); + } + + void Validate(); + + protected: + struct VariableBuilder { + OpProto::Var* var_; + + VariableBuilder& AsDuplicable() { + var_->set_duplicable(true); + return *this; } - Variable* Output(int index) const { - return scope_->GetVariable(op_.outputs_[index]); + VariableBuilder& AsIntermediate() { + var_->set_intermediate(true); + return *this; } - const OperatorBase& op_; - const std::shared_ptr& scope_; - const platform::DeviceContext& device_context_; + VariableBuilder& NotInGradient() { + var_->set_not_in_gradient(true); + return *this; + } }; - virtual void Compute(const KernelContext& context) const = 0; + VariableBuilder AddInput(const std::string& name, const std::string& comment); + + VariableBuilder AddOutput(const std::string& name, + const std::string& comment); + + template + TypedAttrChecker& AddAttr(const std::string& name, + const std::string& comment, + bool generated = false) { + auto* attr = proto_->add_attrs(); + attr->set_name(name); + attr->set_comment(comment); + attr->set_generated(generated); + attr->set_type(AttrTypeID()); + return op_checker_->AddAttrChecker(name); + } + + void AddComment(const std::string& comment) { proto_->set_comment(comment); } + + private: + void CheckNoDuplicatedInOutAttrs(); + + OpProto* proto_; + OpAttrChecker* op_checker_; + bool validated_{false}; +}; + +class NOPMaker : public OpProtoAndCheckerMaker { + public: + NOPMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) {} +}; + +class InferShapeContext { + public: + InferShapeContext(const OperatorBase& op, const Scope& scope) + : op_(op), scope_(scope) {} + + size_t InputSize(const std::string& name) const { + return op_.Inputs(name).size(); + } + + size_t OutputSize(const std::string& name) const { + return op_.Outputs(name).size(); + } + + const Variable* InputVar(const std::string& name) const { + return scope_.FindVar(op_.Input(name)); + } + + Variable* OutputVar(const std::string& name) const { + return scope_.FindVar(op_.Output(name)); + } + + const std::vector 
MultiInputVar( + const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform( + names.begin(), names.end(), std::back_inserter(res), + [this](const std::string& name) { return scope_.FindVar(name); }); + return res; + } + + std::vector MultiOutputVar(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform( + names.begin(), names.end(), std::back_inserter(res), + [this](const std::string& name) { return scope_.FindVar(name); }); + return res; + } + + template + const T* Input(const std::string& name) const { + auto* var = InputVar(name); + PADDLE_ENFORCE_NOT_NULL(var, "Input(%s) should not be nullptr", name); + return &var->Get(); + } + + template + T* Output(const std::string& name) const { + auto var = OutputVar(name); + PADDLE_ENFORCE_NOT_NULL(var, "Output(%s) should not be nullptr", name); + return var->GetMutable(); + } + + template + const std::vector MultiInput(const std::string& name) const { + auto names = op_.Inputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + PADDLE_ENFORCE_NOT_NULL( + var, "MultiInput(%s:%s) should not be nullptr", name, + sub_name); + return &var->Get(); + }); + return res; + } + + template + std::vector MultiOutput(const std::string& name) const { + auto names = op_.Outputs(name); + std::vector res; + res.reserve(names.size()); + std::transform(names.begin(), names.end(), std::back_inserter(res), + [&](const std::string& sub_name) { + auto var = scope_.FindVar(sub_name); + PADDLE_ENFORCE_NOT_NULL( + var, "MultiOutput(%s:%s) should not be nullptr.", name, + sub_name); + return var->GetMutable(); + }); + return res; + } + + const OperatorBase& op_; + const Scope& scope_; +}; + +template +struct EigenDeviceConverter; + +template <> +struct EigenDeviceConverter { + using EigenDeviceType = Eigen::DefaultDevice; +}; + +#ifndef PADDLE_ONLY_CPU +template <> +struct EigenDeviceConverter { + using EigenDeviceType = Eigen::GpuDevice; +}; +#endif + +class ExecutionContext : public InferShapeContext { + public: + ExecutionContext(const OperatorBase& op, const Scope& scope, + const platform::DeviceContext* device_context) + : InferShapeContext(op, scope), device_context_(device_context) {} + + template ::EigenDeviceType> + DeviceType& GetEigenDevice() const; + + platform::Place GetPlace() const { return device_context_->GetPlace(); } + + const platform::DeviceContext* device_context() const { + return device_context_; + } + + const platform::DeviceContext* device_context_; +}; + +class OpKernel { + public: + /** + * ExecutionContext is the only parameter of Kernel Run function. + * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * ExecutionContext. User should construct it before run the Operator. 
+ */ + + virtual void Compute(const ExecutionContext& context) const = 0; virtual ~OpKernel() {} }; @@ -109,11 +370,13 @@ class OperatorWithKernel : public OperatorBase { platform::Place place_; OpKernelKey() = default; - OpKernelKey(const platform::DeviceContext& dev_ctx) { + explicit OpKernelKey(const platform::DeviceContext& dev_ctx) { place_ = dev_ctx.GetPlace(); } - bool operator==(const OpKernelKey& o) const { return place_ == o.place_; } + bool operator==(const OpKernelKey& o) const { + return platform::places_are_same_class(place_, o.place_); + } }; struct OpKernelHash { @@ -126,29 +389,35 @@ class OperatorWithKernel : public OperatorBase { using OpKernelMap = std::unordered_map, OpKernelHash>; - void Run(const std::shared_ptr& scope, + OperatorWithKernel(const std::string& type, const VarNameMap& inputs, + const VarNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(const Scope& scope) const override { + InferShape(InferShapeContext(*this, scope)); + } + + void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const final { - auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx)); - opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx)); + auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); + opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx)); } static std::unordered_map& AllOpKernels() { static std::unordered_map g_all_op_kernels; return g_all_op_kernels; - }; + } + + bool SupportGPU() const override { + OperatorWithKernel::OpKernelKey key; + key.place_ = platform::GPUPlace(); + return OperatorWithKernel::AllOpKernels().at(type_).count(key) != 0; + } + + protected: + virtual void InferShape(const InferShapeContext& ctx) const = 0; }; } // namespace framework } // namespace paddle - -#define REGISTER_OP_KERNEL(type, PlaceType, KernelType) \ - struct __op_kernel_register__##type##__ { \ - __op_kernel_register__##type##__() { \ - ::paddle::framework::OperatorWithKernel::OpKernelKey key; \ - key.place_ = PlaceType(); \ - ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \ - .reset(new KernelType()); \ - } \ - }; \ - static __op_kernel_register__##type##__ __reg_kernel_##type##__ diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 01b87bb50e..2425b87779 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -19,110 +19,247 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -class OperatorTest : public OperatorBase { +static int op_run_num = 0; + +class OpWithoutKernelTest : public OperatorBase { public: - void Init() override { x = 1; } - void InferShape(const std::shared_ptr& scope) const override {} - void Run(const std::shared_ptr& scope, + OpWithoutKernelTest(const std::string& type, const VarNameMap& inputs, + const VarNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs), x(1) {} + void InferShape(const Scope& scope) const override {} + void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const override { - float scale = GetAttr("scale"); - ASSERT_NEAR(scale, 3.14, 1e-5); - ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr); + ++op_run_num; + ASSERT_EQ(static_cast(inputs_.size()), 1); + ASSERT_EQ(static_cast(outputs_.size()), 1); + ASSERT_EQ(scope.FindVar(inputs_.at("input")[0]), nullptr); ASSERT_EQ(x, 1); - ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr); + ASSERT_NE(scope.FindVar(outputs_.at("output")[0]), nullptr); } public: - float x = 0; + int x{0}; }; -class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { +class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: - OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("input", "input of test op"); AddOutput("output", "output of test op"); - AddAttr("scale", "scale of cosine op") - .SetDefault(1.0) - .LargerThan(0.0); - AddType("test_operator"); + AddAttr("scale", "scale of cosine op"); AddComment("This is test op"); } }; -REGISTER_OP(test_operator, OperatorTest, OperatorTestProtoAndCheckerMaker); +} // namespace framework +} // namespace paddle + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +REGISTER_OP_WITHOUT_GRADIENT( + test_operator, paddle::framework::OpWithoutKernelTest, + paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker); TEST(OperatorBase, all) { - OpDesc op_desc; + paddle::framework::OpDesc op_desc; op_desc.set_type("test_operator"); - *op_desc.mutable_inputs()->Add() = "IN1"; - *op_desc.mutable_outputs()->Add() = "OUT1"; + BuildVar("input", {"IN1"}, op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, op_desc.add_outputs()); + auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); attr->set_type(paddle::framework::AttrType::FLOAT); - float scale = 3.14; - attr->set_f(scale); + attr->set_f(3.14); - platform::CPUDeviceContext device_context; - auto scope = std::make_shared(); + paddle::platform::CPUDeviceContext device_context; + paddle::framework::Scope scope; - OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); - ASSERT_EQ(op->GetAttr("scale"), scale); - scope->CreateVariable("OUT1"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + scope.NewVar("OUT1"); + ASSERT_EQ(paddle::framework::op_run_num, 0); + op->InferShape(scope); op->Run(scope, device_context); - std::cout << op->DebugString() << std::endl; - delete op; + ASSERT_EQ(paddle::framework::op_run_num, 1); } +namespace paddle { +namespace framework { + class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: 
OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("input", "input of test op"); - AddOutput("output", "output of test op"); + AddInput("x", "input of test op"); + AddOutput("y", "output of test op"); AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .LargerThan(0.0); - AddType("test_operator"); AddComment("This is test op"); } }; +static int cpu_kernel_run_num = 0; + class OpWithKernelTest : public OperatorWithKernel { public: - void InferShape(const std::shared_ptr& scope) const override {} + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext& ctx) const override {} }; +template class CPUKernelTest : public OpKernel { public: - void Compute(const KernelContext& context) const { - float scale = context.op_.GetAttr("scale"); - ASSERT_NEAR(scale, 3.14, 1e-5); + void Compute(const ExecutionContext& ctx) const { std::cout << "this is cpu kernel" << std::endl; - std::cout << context.op_.DebugString() << std::endl; + std::cout << ctx.op_.DebugString() << std::endl; + cpu_kernel_run_num++; + ASSERT_EQ(ctx.op_.Input("x"), "IN1"); + ASSERT_EQ(ctx.op_.Output("y"), "OUT1"); } }; -REGISTER_OP(op_with_kernel, OpWithKernelTest, OpKernelTestProtoAndCheckerMaker); -REGISTER_OP_KERNEL(op_with_kernel, platform::CPUPlace, CPUKernelTest); +class OpKernelTestMultiInputsProtoAndCheckerMaker + : public OpProtoAndCheckerMaker { + public: + OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("xs", "inputs of test op").AsDuplicable(); + AddInput("k", "input of test op"); + AddOutput("ys", "outputs of test op").AsDuplicable(); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .LargerThan(0.0); + AddComment("This is test op"); + } +}; + +class CPUKernalMultiInputsTest : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + auto xs = ctx.op_.Inputs("xs"); + ASSERT_EQ(xs.size(), 3UL); + ASSERT_EQ(xs[0], "x0"); + ASSERT_EQ(xs[1], "x1"); + ASSERT_EQ(xs[2], "x2"); + + auto inVar0 = ctx.MultiInputVar("xs"); + ASSERT_EQ(inVar0.size(), 3U); + + auto intVar1 = ctx.InputVar("k"); + ASSERT_NE(intVar1, nullptr); + auto outVar0 = ctx.MultiOutputVar("ys"); + ASSERT_EQ(outVar0.size(), 2U); + + auto inTensor0 = ctx.MultiInput("xs"); + ASSERT_EQ(inTensor0.size(), 3U); + + auto intTensor1 = ctx.Input("k"); + ASSERT_NE(intTensor1, nullptr); + + auto outTensor0 = ctx.MultiOutput("ys"); + ASSERT_EQ(outTensor0.size(), 2U); + + auto k = ctx.op_.Input("k"); + ASSERT_EQ(k, "k0"); + + auto ys = ctx.op_.Outputs("ys"); + ASSERT_EQ(ys.size(), 2UL); + ASSERT_EQ(ys[0], "y0"); + ASSERT_EQ(ys[1], "y1"); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + op_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_with_kernel, + paddle::framework::CPUKernelTest); + +// test with single input TEST(OpKernel, all) { - OpDesc op_desc; + paddle::framework::OpDesc op_desc; op_desc.set_type("op_with_kernel"); - *op_desc.mutable_inputs()->Add() = "IN1"; - *op_desc.mutable_outputs()->Add() = "OUT1"; + BuildVar("x", {"IN1"}, op_desc.add_inputs()); + BuildVar("y", {"OUT1"}, op_desc.add_outputs()); + auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); attr->set_type(paddle::framework::AttrType::FLOAT); attr->set_f(3.14); - 
platform::CPUDeviceContext cpu_device_context; - auto scope = std::make_shared(); + paddle::platform::CPUDeviceContext cpu_device_context; + paddle::framework::Scope scope; - OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); op->Run(scope, cpu_device_context); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); +} + +REGISTER_OP_WITHOUT_GRADIENT( + op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, + paddle::framework::CPUKernalMultiInputsTest); - delete op; +// test with multi inputs +TEST(OpKernel, multi_inputs) { + using namespace paddle::framework; + + OpDesc op_desc; + op_desc.set_type("op_multi_inputs_with_kernel"); + BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); + BuildVar("k", {"k0"}, op_desc.add_inputs()); + BuildVar("ys", {"y0", "y1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUDeviceContext cpu_device_context; + paddle::framework::Scope scope; + scope.NewVar("x0")->GetMutable(); + scope.NewVar("x1")->GetMutable(); + scope.NewVar("x2")->GetMutable(); + scope.NewVar("k0")->GetMutable(); + scope.NewVar("y0")->GetMutable(); + scope.NewVar("y1")->GetMutable(); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + op->Run(scope, cpu_device_context); } -} // namespace framework -} // namespace paddle \ No newline at end of file + +class OperatorClone : public paddle::framework::OperatorBase { + public: + DEFINE_OP_CLONE_METHOD(OperatorClone); + OperatorClone(const std::string& type, const VarNameMap& inputs, + const VarNameMap& outputs, + const paddle::framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void InferShape(const paddle::framework::Scope& scope) const override {} + void Run(const paddle::framework::Scope& scope, + const paddle::platform::DeviceContext& dev_ctx) const override {} +}; + +TEST(Operator, Clone) { + OperatorClone a("ABC", {}, {}, {}); + auto b = a.Clone(); + ASSERT_EQ(a.Type(), b->Type()); +} \ No newline at end of file diff --git a/paddle/framework/pybind.cc b/paddle/framework/pybind.cc new file mode 100644 index 0000000000..de119e9e06 --- /dev/null +++ b/paddle/framework/pybind.cc @@ -0,0 +1,258 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/framework/backward.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/tensor_py.h" +#include "paddle/operators/net_op.h" +#include "paddle/operators/recurrent_op.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/place.h" +#include "paddle/string/to_string.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +USE_OP(add_two); +USE_OP(onehot_cross_entropy); +USE_OP(sgd); +USE_OP(mul); +USE_OP(mean); +USE_OP(sigmoid); +USE_OP(softmax); +USE_OP(rowwise_add); +USE_OP(fill_zeros_like); +USE_OP_ITSELF(recurrent_op); +USE_OP(gaussian_random); +USE_OP(uniform_random); + +namespace paddle { +namespace framework { + +using Tensor = framework::Tensor; + +static size_t UniqueIntegerGenerator() { + static std::atomic generator; + return generator.fetch_add(1); +} + +bool IsCompileGPU() { +#ifdef PADDLE_ONLY_CPU + return false; +#else + return true; +#endif +} + +PYBIND11_PLUGIN(core) { + py::module m("core", "C++ core of PaddlePaddle"); + + py::class_(m, "Tensor", py::buffer_protocol()) + .def_buffer( + [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) + .def("get_dims", + [](const Tensor &self) { return vectorize(self.dims()); }) + .def("set_dims", + [](Tensor &self, const std::vector &dim) { + self.Resize(make_ddim(dim)); + }) + .def("alloc_float", + [](Tensor &self, paddle::platform::GPUPlace &place) { + self.mutable_data(place); + }) + .def("alloc_float", + [](Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("alloc_int", + [](Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("alloc_int", + [](Tensor &self, paddle::platform::GPUPlace &place) { + self.mutable_data(place); + }) + .def("set", PyCPUTensorSetFromArray) + .def("set", PyCPUTensorSetFromArray) +#ifndef PADDLE_ONLY_CPU + .def("set", PyCUDATensorSetFromArray) + .def("set", PyCUDATensorSetFromArray) +#endif + .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) + .def("set_float_element", + [](Tensor &self, size_t offset, float f) { + // TODO(yuyang18): Only support GPU now. + self.data()[offset] = f; + }) + .def("get_float_element", [](Tensor &self, size_t offset) -> float { + // TODO(yuyang18): Only support GPU now. + return self.data()[offset]; + }); + + py::class_(m, "Variable", R"DOC(Variable Class. + +All parameter, weight, gradient are variables in Paddle. +)DOC") + .def("is_int", [](const Variable &var) { return var.IsType(); }) + .def("set_int", + [](Variable &var, int val) -> void { *var.GetMutable() = val; }) + .def("get_int", [](const Variable &var) -> int { return var.Get(); }) + .def("get_tensor", + [](Variable &self) -> Tensor * { return self.GetMutable(); }, + py::return_value_policy::reference) + .def("get_net", + [](Variable &self) -> operators::NetOp * { + return self.GetMutable(); + }, + py::return_value_policy::reference); + + py::class_(m, "Scope", "") + .def("new_var", + [](Scope &self, const std::string &name) -> Variable * { + return self.NewVar(name); + }, + py::return_value_policy::reference) + .def("find_var", &Scope::FindVar, py::return_value_policy::reference) + .def(py::init<>()) + .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, + py::return_value_policy::reference) + .def("drop_kids", &Scope::DropKids); + + //! @note: Be careful! PyBind will return std::string as an unicode, not + //! Python str. 
If you want a str object, you should cast them in Python. + m.def("get_all_op_protos", []() -> std::vector { + auto &op_info_map = OpRegistry::op_info_map(); + std::vector ret_values; + for (auto it = op_info_map.begin(); it != op_info_map.end(); ++it) { + const OpProto *proto = it->second.proto_; + if (proto == nullptr) { + continue; + } + PADDLE_ENFORCE(proto->IsInitialized(), "OpProto must all be initialized"); + std::string str; + PADDLE_ENFORCE(proto->SerializeToString(&str), + "Serialize OpProto Error. This could be a bug of Paddle."); + ret_values.push_back(py::bytes(str)); + } + return ret_values; + }); + m.def_submodule( + "var_names", + "The module will return special predefined variable name in Paddle") + .def("empty", []() { return kEmptyVarName; }) + .def("temp", []() { return kTempVarName; }); + // clang-format off + py::class_(m, "DeviceContext") + .def_static("create", + [](paddle::platform::CPUPlace& place) + -> paddle::platform::DeviceContext* { + return new paddle::platform::CPUDeviceContext(); + }) + .def_static("create", + [](paddle::platform::GPUPlace& place) + -> paddle::platform::DeviceContext* { +#ifdef PADDLE_ONLY_CPU + PADDLE_THROW("GPUPlace is not supported in CPU device."); +#else + return new paddle::platform::CUDADeviceContext(place); +#endif + }); + // clang-format on + + py::class_(m, "GPUPlace") + .def(py::init()) + .def("__str__", string::to_string); + + py::class_(m, "CPUPlace") + .def(py::init<>()) + .def("__str__", string::to_string); + + py::class_(m, "Operator") + .def_static("create", + [](py::bytes protobin) { + OpDesc desc; + PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), + "Cannot parse user input to OpDesc"); + PADDLE_ENFORCE(desc.IsInitialized(), + "User OpDesc is not initialized, reason %s", + desc.InitializationErrorString()); + return OpRegistry::CreateOp(desc); + }) + .def("backward", + [](const OperatorBase &forwardOp, + const std::unordered_set &no_grad_vars) { + return Backward(forwardOp, no_grad_vars).release(); + }) + .def("infer_shape", &OperatorBase::InferShape) + .def("run", &OperatorBase::Run) + .def("type", + [](const OperatorBase &op) -> std::string { return op.Type(); }) + .def("outputs", + [](const OperatorBase &op) + -> std::map> { + return op.Outputs(); + }) + .def("inputs", [](const OperatorBase &op) { return op.Inputs(); }) + .def("__str__", &OperatorBase::DebugString) + .def("no_intermediate_outputs", + [](const OperatorBase &op) { return op.OutputVars(false); }) + .def("support_gpu", &OperatorBase::SupportGPU); + + py::class_(m, "Net") + .def_static("create", + []() -> operators::NetOp * { + auto *retv = new operators::NetOp; + retv->SetType("plain_net"); + return retv; + }) + .def("append_op", [](operators::NetOp &self, + const OperatorBase &op) { self.AppendOp(op); }) + .def("complete_add_op", &operators::NetOp::CompleteAddOp) + .def("complete_add_op", [](std::shared_ptr &self) { + self->CompleteAddOp(); + }); + + // recurrent_op + py::class_(m, "RecurrentOp") + .def_static( + "create", + [](py::bytes protobin) -> operators::RecurrentOp * { + OpDesc desc; + PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), + "Cannot parse user input to OpDesc"); + PADDLE_ENFORCE(desc.IsInitialized(), + "User OpDesc is not initialized, reason %s", + desc.InitializationErrorString()); + auto rnn_op = OpRegistry::CreateOp(desc); + return static_cast(rnn_op.release()); + }) + .def("set_stepnet", [](operators::RecurrentOp &self, + const operators::NetOp &net) -> void { + self.set_stepnet(net.Clone()); + }); + + 
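The Operator and RecurrentOp bindings above are thin wrappers over the C++ interface reworked in this patch: OpRegistry::CreateOp now returns a std::unique_ptr instead of a raw pointer, and InferShape/Run take a Scope by reference rather than a std::shared_ptr. For reference, a minimal C++ sketch of that flow, mirroring operator_test.cc, is shown below; it is illustrative only and assumes an operator type such as "test_operator" with a float attribute "scale" has already been registered.

#include "paddle/framework/op_registry.h"
#include "paddle/framework/scope.h"
#include "paddle/platform/device_context.h"

// Build an OpDesc proto, create the operator through the registry, and run it
// against a Scope with a CPU device context.
void RunRegisteredOpExample() {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("test_operator");  // any registered operator type
  // Inputs and outputs are now (parameter, arguments) pairs, not flat name lists.
  auto* in = op_desc.add_inputs();
  in->set_parameter("input");
  *in->mutable_arguments()->Add() = "IN1";
  auto* out = op_desc.add_outputs();
  out->set_parameter("output");
  *out->mutable_arguments()->Add() = "OUT1";
  // Attributes are set on the OpDesc as before.
  auto* attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
  attr->set_type(paddle::framework::AttrType::FLOAT);
  attr->set_f(3.14);

  paddle::framework::Scope scope;  // passed by reference below
  scope.NewVar("OUT1");            // create the output variable up front
  paddle::platform::CPUDeviceContext dev_ctx;

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);  // std::unique_ptr
  op->InferShape(scope);
  op->Run(scope, dev_ctx);
}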
m.def("unique_integer", UniqueIntegerGenerator); + + m.def("is_compile_gpu", IsCompileGPU); + + return m.ptr(); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc new file mode 100644 index 0000000000..080b4ac621 --- /dev/null +++ b/paddle/framework/scope.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/scope.h" +#include "paddle/string/printf.h" + +namespace paddle { +namespace framework { + +Scope::~Scope() { + DropKids(); + for (auto& kv : vars_) delete kv.second; +} + +Scope& Scope::NewScope() const { + kids_.push_back(new Scope(this)); + return *kids_.back(); +} + +Variable* Scope::NewVar(const std::string& name) { + auto iter = vars_.find(name); + if (iter != vars_.end()) { + return iter->second; + } + Variable* v = new Variable(); + vars_[name] = v; + v->name_ = &(vars_.find(name)->first); + return v; +} + +Variable* Scope::NewVar() { + return NewVar(string::Sprintf("%p.%d", this, vars_.size())); +} + +Variable* Scope::FindVar(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) return it->second; + return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); +} + +const Scope* Scope::FindScope(const Variable* var) const { + for (auto& kv : vars_) { + if (kv.second == var) { + return this; + } + } + return (parent_ == nullptr) ? nullptr : parent_->FindScope(var); +} +void Scope::DropKids() { + for (Scope* s : kids_) delete s; + kids_.clear(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index a4470f726f..2ba3f8ed35 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -14,15 +14,17 @@ limitations under the License. */ #pragma once +#include #include #include -#include #include "paddle/framework/variable.h" namespace paddle { namespace framework { +class Scope; + /** * @brief Scope that manage all variables. * @@ -33,62 +35,42 @@ namespace framework { */ class Scope { public: - /** - * @brief Initialize s Scope without parent. - */ Scope() {} + ~Scope(); + + // Disable Copy, Assign, Move. + Scope(const Scope& other) = delete; + Scope& operator=(const Scope& other) = delete; + Scope(Scope&& other) = delete; + + /// Create a sub-scope. Returns a reference other than a pointer so + /// to prevent from manual deletion. + /// Mark it to const because that new kid scope cannot change parent scope. + Scope& NewScope() const; + + /// Create a variable with given name if it doesn't exist. + Variable* NewVar(const std::string& name); - /** - * @brief Initialize a Scope with parent. - */ - explicit Scope(const std::shared_ptr& parent) : parent_(parent) {} - - /** - * @brief Create Variable - * - * Create Variable in this Scope. Return the exist one if Variable already - * been created. 
- */ - Variable* CreateVariable(const std::string& name) { - auto var = GetVariable(name); - if (var) { - return var; - } else { - vars_[name] = std::unique_ptr(new Variable()); - return GetVariable(name); - } - } - - /** - * @brief Get Variable. - * - * Get Variable from this Scope, this function will recursive find Variable - * from it's parent scope. Return nullptr if not found. - */ - Variable* GetVariable(const std::string& name) const { - auto it = vars_.find(name); - if (it != vars_.end()) { - return it->second.get(); - } else if (parent_ != nullptr) { - return parent_->GetVariable(name); - } else { - return nullptr; - } - } - - /** - * @brief If this scope has a Var named name. - * - * Find if there is a Variable in this scope and it's parent scope - */ - bool HasVariable(const std::string& name) const { - return (vars_.find(name) != vars_.end() || - (parent_ && parent_->HasVariable(name))); - } + /// Create a variable with a scope-unique name. + Variable* NewVar(); + + /// Find a variable in the scope or any of its ancestors. Returns + /// nullptr if cannot find. + Variable* FindVar(const std::string& name) const; + + /// Find the scope or an ancestor scope that contains the given variable. + const Scope* FindScope(const Variable* var) const; + + /// Drop all kids scopes belonged to this scope. + void DropKids(); private: - std::unordered_map> vars_; - std::shared_ptr parent_{nullptr}; + // Call Scope::NewScope for a sub-scope. + explicit Scope(Scope const* parent) : parent_(parent) {} + + std::unordered_map vars_; + mutable std::list kids_; + Scope const* parent_{nullptr}; }; } // namespace framework diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc index df1afb200c..9d51e355b0 100644 --- a/paddle/framework/scope_test.cc +++ b/paddle/framework/scope_test.cc @@ -15,44 +15,42 @@ limitations under the License. */ #include "paddle/framework/scope.h" #include "gtest/gtest.h" -TEST(Scope, Create) { - using paddle::framework::Scope; - using paddle::framework::Variable; +using paddle::framework::Scope; +using paddle::framework::Variable; - auto scope = std::make_shared(); +TEST(Scope, VarsShadowing) { + Scope s; + Scope& ss1 = s.NewScope(); + Scope& ss2 = s.NewScope(); - Variable* var0 = scope->CreateVariable(""); - EXPECT_NE(var0, nullptr); + Variable* v0 = s.NewVar("a"); + Variable* v1 = ss1.NewVar("a"); - /// GetVariable will return nullptr if not exist. - Variable* var1 = scope->GetVariable("a"); - EXPECT_EQ(var1, nullptr); + EXPECT_NE(v0, v1); - /// CreateVariable will return one. - Variable* var2 = scope->CreateVariable("a"); - EXPECT_NE(var2, nullptr); + EXPECT_EQ(v0, s.FindVar("a")); + EXPECT_EQ(v1, ss1.FindVar("a")); + EXPECT_EQ(v0, ss2.FindVar("a")); +} - /// Get the created variable. - Variable* var3 = scope->GetVariable("a"); - EXPECT_EQ(var2, var3); +TEST(Scope, FindVar) { + Scope s; + Scope& ss = s.NewScope(); - /// CreateVariable will just return the variable if it's - /// already exist. 
- Variable* var4 = scope->CreateVariable("a"); - EXPECT_EQ(var4, var2); -} + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_EQ(nullptr, ss.FindVar("a")); -TEST(Scope, Parent) { - using paddle::framework::Scope; - using paddle::framework::Variable; + ss.NewVar("a"); - auto parent_scope = std::make_shared(); - auto scope = std::make_shared(parent_scope); + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_NE(nullptr, ss.FindVar("a")); +} - Variable* var0 = parent_scope->CreateVariable("a"); - EXPECT_NE(var0, nullptr); +TEST(Scope, FindScope) { + Scope s; + Scope& ss = s.NewScope(); + Variable* v = s.NewVar("a"); - /// GetVariable will get Variable from parent scope if exist. - Variable* var1 = scope->GetVariable("a"); - EXPECT_EQ(var0, var1); + EXPECT_EQ(&s, s.FindScope(v)); + EXPECT_EQ(&s, ss.FindScope(v)); } diff --git a/paddle/framework/net_test.cc b/paddle/framework/tensor.cc similarity index 78% rename from paddle/framework/net_test.cc rename to paddle/framework/tensor.cc index a8e31c1497..ea7b2a1f7b 100644 --- a/paddle/framework/net_test.cc +++ b/paddle/framework/tensor.cc @@ -12,13 +12,8 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/framework/net.h" -#include "paddle/framework/op_registry.h" - -#include +#include "paddle/framework/tensor.h" namespace paddle { -namespace framework { -class FakeFC : public Operator {} -} // namespace framework +namespace framework {} } // namespace paddle diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index a0945e8055..b8c779f4e5 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -15,123 +15,158 @@ limitations under the License. */ #pragma once #include +#include #include -#include +#include +#include + #include "paddle/framework/ddim.h" -#include "paddle/framework/enforce.h" #include "paddle/memory/memory.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/enforce.h" #include "paddle/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { + namespace framework { +namespace details { +template +struct CastToPyBufferImpl; +} class Tensor { + public: + template + friend struct details::CastToPyBufferImpl; + + template + friend struct EigenTensor; + + template + friend struct EigenVector; + public: Tensor() : offset_(0) {} - explicit Tensor(const DDim& dims) : dims_(dims), offset_(0) {} + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /*! Return a pointer to constant memory block. */ + template + inline const T* data() const; + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ template - const T* data() const { - PADDLE_ENFORCE( - holder_ != nullptr, - "Tenosr has not been initialized. 
Call Tensor::mutable_data first."); - return reinterpret_cast( - reinterpret_cast(holder_->Ptr()) + offset_); - } - - template ::value>::type* = nullptr> - T* mutable_data(DDim dims, paddle::platform::Place place) { - dims_ = dims; - if (holder_ == nullptr || - !(holder_->Place() == - place) /* some versions of boost::variant don't have operator!= */ - || holder_->Size() < product(dims) * sizeof(T) + offset_) { - holder_.reset(new PlaceholderImpl(place, product(dims) * sizeof(T))); - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->Ptr()) + - offset_); - } - - void ShareDataFrom(const Tensor& src) { - PADDLE_ENFORCE(src.holder_ != nullptr, - "Can not share data from an uninitialized tensor."); - holder_ = src.holder_; - dims_ = src.dims_; - offset_ = src.offset_; - } - - Tensor Slice(const int& begin_idx, const int& end_idx) const { - PADDLE_ENFORCE(holder_ != nullptr, - "The sliced tenosr has not been initialized."); - PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0], - "Slice index is less than zero or out of bound."); - PADDLE_ENFORCE(begin_idx < end_idx, - "Begin index must be less than end index."); - PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); - std::vector d = vectorize(dims_); - int base = 1; - for (size_t i = 1; i < d.size(); ++i) { - base *= d[i]; - } - Tensor dst; - dst.holder_ = holder_; - dst.dims_ = dims_; - dst.dims_[0] = end_idx - begin_idx; - dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize(); - return dst; - } + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); - DDim dims() const { return dims_; } + /*! The internal of two tensors share the same memory block. */ + template + inline Tensor& ShareDataWith(const Tensor& src); + + /** + * @brief Copy the content of external tensor to a new place. + * + * @param[in] src The external tensor. + * @param[in] ctx The device context contains place where to store. + * + * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. + */ + template + inline void CopyFrom(const Tensor& src, const platform::Place& dst_place); + + /** + * @brief Return the slice of the tensor. + * + * @param[in] begin_idx The begin index of the slice. + * @param[in] end_idx The end index of the slice. + */ + template + inline Tensor Slice(const int& begin_idx, const int& end_idx) const; + + platform::Place place() const { return holder_->place(); } + + private: + template + inline void check_memory_size() const; private: - // Placeholder hides type T, so it doesn't appear as a template - // parameter of Variable. + /** + * @note Placeholder hides type T, so it doesn't appear as a template + * parameter of Variable. 
+ */ struct Placeholder { virtual ~Placeholder() {} - virtual void* Ptr() const = 0; - virtual paddle::platform::Place Place() const = 0; - virtual size_t Size() const = 0; - virtual size_t TypeSize() const = 0; + virtual void* ptr() const = 0; + virtual size_t size() const = 0; + virtual std::type_index type() const = 0; + virtual platform::Place place() const = 0; }; - template + template struct PlaceholderImpl : public Placeholder { - private: - class Deleter { - public: - Deleter(platform::Place place) : place_(place) {} - void operator()(T* ptr) { - paddle::memory::Free(place_, static_cast(ptr)); - } - - private: - paddle::platform::Place place_; - }; - - public: - PlaceholderImpl(paddle::platform::Place place, size_t size) - : ptr_(static_cast(paddle::memory::Alloc(place, size)), - Deleter(place)), + PlaceholderImpl(Place place, size_t size) + : ptr_(static_cast(memory::Alloc(place, size)), + memory::PODDeleter(place)), place_(place), - size_(size) {} + size_(size) { + PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", + (is_cpu_place(place_) ? "CPU" : "GPU")); + } + + virtual size_t size() const { return size_; } + virtual platform::Place place() const { return place_; } + virtual void* ptr() const { return static_cast(ptr_.get()); } + virtual std::type_index type() const { return std::type_index(typeid(T)); } + + /*! the pointer of memory block. */ + std::unique_ptr> ptr_; - virtual void* Ptr() const { return static_cast(ptr_.get()); } - virtual size_t Size() const { return size_; } - virtual paddle::platform::Place Place() const { return place_; } - virtual size_t TypeSize() const { return sizeof(T); } + /*! the place of memory block. */ + platform::Place place_; - std::unique_ptr ptr_; - paddle::platform::Place place_; // record the place of ptr_. - size_t size_; // size of the memory block. + /*! the size of memory block. */ + size_t size_; }; - std::shared_ptr holder_; // holds the memory block if allocated. + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. */ DDim dims_; - size_t offset_; // marks the begin of tensor data area. + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really begins. + */ + size_t offset_; }; } // namespace framework } // namespace paddle + +#include "paddle/framework/tensor_impl.h" diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h new file mode 100644 index 0000000000..7d7263b899 --- /dev/null +++ b/paddle/framework/tensor_impl.h @@ -0,0 +1,150 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/memory/memcpy.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace framework { + +template +inline void Tensor::check_memory_size() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tenosr holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE_GE( + holder_->size(), product(dims_) * sizeof(T) + offset_, + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory.\n" + "or maybe the required data-type mismatches the data already stored."); +} + +template +inline const T* Tensor::data() const { + check_memory_size(); + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); +} + +template +inline T* Tensor::data() { + check_memory_size(); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline T* Tensor::mutable_data(DDim dims, platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + Resize(dims); + return mutable_data(place); +} + +template +inline T* Tensor::mutable_data(platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + PADDLE_ENFORCE_GT(product(dims_), 0, + "Tensor's numel must be larger than zero to call " + "Tensor::mutable_data. Call Tensor::set_dim first."); + /* some versions of boost::variant don't have operator!= */ + size_t size = product(dims_) * sizeof(T); + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size)); + } else if (platform::is_gpu_place(place)) { +#ifdef PADDLE_ONLY_CPU + PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); + } +#else + holder_.reset(new PlaceholderImpl( + boost::get(place), size)); + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline Tensor& Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; + return *this; +} + +template +inline void Tensor::CopyFrom(const Tensor& src, + const platform::Place& dst_place) { + src.check_memory_size(); + Resize(src.dims()); + + auto src_place = src.holder_->place(); + auto src_ptr = static_cast(src.data()); + + auto dst_ptr = static_cast(mutable_data(dst_place)); + + auto size = product(src.dims_) * sizeof(T); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifndef PADDLE_ONLY_CPU + else if (platform::is_gpu_place(src_place) && + platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } + +#endif +} + +template +inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { + check_memory_size(); + PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero."); + PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound."); + PADDLE_ENFORCE_LT(begin_idx, end_idx, + "Begin index must be less than end index."); + PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a 
tensor with dims_[0] = 1."); + int base = product(dims_) / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * sizeof(T); + return dst; +} + +inline Tensor& Tensor::Resize(const DDim& dims) { + dims_ = dims; + return *this; +} + +inline const DDim& Tensor::dims() const { return dims_; } + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/tensor_py.h b/paddle/framework/tensor_py.h new file mode 100644 index 0000000000..4e1ab77b15 --- /dev/null +++ b/paddle/framework/tensor_py.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include "paddle/framework/tensor.h" +#include "paddle/memory/memcpy.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" + +namespace py = pybind11; + +namespace paddle { + +namespace framework { + +namespace details { + +template +struct CastToPyBufferImpl; + +template +struct CastToPyBufferImpl { + py::buffer_info operator()(framework::Tensor &tensor) { + PADDLE_THROW("This type of tensor cannot be expose to Python"); + return py::buffer_info(); + } +}; + +template +struct CastToPyBufferImpl { + using CUR_TYPE = typename std::tuple_element>::type; + py::buffer_info operator()(framework::Tensor &tensor) { + if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) { + auto dim_vec = framework::vectorize(tensor.dims()); + std::vector dims_outside; + std::vector strides; + dims_outside.resize(dim_vec.size()); + strides.resize(dim_vec.size()); + + size_t prod = 1; + for (size_t i = dim_vec.size(); i != 0; --i) { + dims_outside[i - 1] = (size_t)dim_vec[i - 1]; + strides[i - 1] = sizeof(CUR_TYPE) * prod; + prod *= dims_outside[i - 1]; + } + framework::Tensor dst_tensor; + if (paddle::platform::is_gpu_place(tensor.holder_->place())) { + dst_tensor.CopyFrom(tensor, platform::CPUPlace()); + } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) { + dst_tensor = tensor; + } + return py::buffer_info( + dst_tensor.mutable_data(dst_tensor.holder_->place()), + sizeof(CUR_TYPE), py::format_descriptor::format(), + (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides); + } else { + constexpr bool less = I + 1 < std::tuple_size>::value; + return CastToPyBufferImpl()(tensor); + } + } +}; +} // namespace details +inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { + auto buffer_info = details::CastToPyBufferImpl()(tensor); + return buffer_info; +} + +template +void PyCPUTensorSetFromArray( + framework::Tensor &self, + py::array_t array, + paddle::platform::CPUPlace &place) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back((int)array.shape()[i]); + } + + self.Resize(framework::make_ddim(dims)); + auto *dst = self.mutable_data(place); + std::memcpy(dst, array.data(), sizeof(T) * array.size()); +} + +#ifndef 
PADDLE_ONLY_CPU +template +void PyCUDATensorSetFromArray( + framework::Tensor &self, + py::array_t array, + paddle::platform::GPUPlace &place) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back((int)array.shape()[i]); + } + + self.Resize(framework::make_ddim(dims)); + auto *dst = self.mutable_data(place); + paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(), + cudaMemcpyHostToDevice); +} +#endif + +} // namespace pybind +} // namespace paddle diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index f4822838cf..7db38d5cae 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -18,7 +18,8 @@ TEST(Tensor, Dims) { using namespace paddle::framework; using namespace paddle::platform; - Tensor tt(make_ddim({2, 3, 4})); + Tensor tt; + tt.Resize({2, 3, 4}); DDim dims = tt.dims(); ASSERT_EQ(arity(dims), 3); for (int i = 0; i < 3; ++i) { @@ -32,10 +33,11 @@ TEST(Tensor, DataAssert) { bool caught = false; try { src_tensor.data(); - } catch (paddle::framework::EnforceNotMet err) { + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = - "Tenosr has not been initialized. Call Tensor::mutable_data first."; + "holder_ should not be null\nTenosr holds no memory. Call " + "Tensor::mutable_data first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ++i) { ASSERT_EQ(what[i], msg[i]); @@ -46,7 +48,7 @@ TEST(Tensor, DataAssert) { /* following tests are not available at present because Memory::Alloc() and Memory::Free() have not been ready. - +*/ TEST(Tensor, MutableData) { using namespace paddle::framework; using namespace paddle::platform; @@ -72,6 +74,7 @@ TEST(Tensor, MutableData) { EXPECT_EQ(p1, p2); } +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; float* p1 = nullptr; @@ -93,9 +96,10 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); EXPECT_EQ(p1, p2); } +#endif } -TEST(Tensor, ShareDataFrom) { +TEST(Tensor, ShareDataWith) { using namespace paddle::framework; using namespace paddle::platform; { @@ -104,10 +108,12 @@ TEST(Tensor, ShareDataFrom) { // Try to share data form uninitialized tensor bool caught = false; try { - dst_tensor.ShareDataFrom(src_tensor); - } catch (EnforceNotMet err) { + dst_tensor.ShareDataWith(src_tensor); + } catch (paddle::platform::EnforceNotMet err) { caught = true; - std::string msg = "Can not share data from an uninitialized tensor."; + std::string msg = + "holder_ should not be null\nTenosr holds no memory. 
Call " + "Tensor::mutable_data first."; const char* what = err.what(); for (size_t i = 0; i < msg.length(); ++i) { ASSERT_EQ(what[i], msg[i]); @@ -116,17 +122,19 @@ TEST(Tensor, ShareDataFrom) { ASSERT_TRUE(caught); src_tensor.mutable_data(make_ddim({2, 3, 4}), CPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; Tensor dst_tensor; src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); - dst_tensor.ShareDataFrom(src_tensor); + dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } +#endif } TEST(Tensor, Slice) { @@ -135,7 +143,7 @@ TEST(Tensor, Slice) { { Tensor src_tensor; src_tensor.mutable_data(make_ddim({5, 3, 4}), CPUPlace()); - Tensor slice_tensor = src_tensor.Slice(1, 3); + Tensor slice_tensor = src_tensor.Slice(1, 3); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 3); EXPECT_EQ(slice_dims[0], 2); @@ -155,10 +163,11 @@ TEST(Tensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); - Tensor slice_tensor = src_tensor.Slice(2, 6); + Tensor slice_tensor = src_tensor.Slice(2, 6); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); @@ -176,6 +185,80 @@ TEST(Tensor, Slice) { EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); } +#endif } -*/ \ No newline at end of file +TEST(Tensor, CopyFrom) { + using namespace paddle::framework; + using namespace paddle::platform; + { + Tensor src_tensor; + Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + auto cpu_place = new paddle::platform::CPUPlace(); + dst_tensor.CopyFrom(src_tensor, *cpu_place); + + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + dst_tensor.CopyFrom(slice_tensor, *cpu_place); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + } +#ifndef PADDLE_ONLY_CPU + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + auto gpu_place = new paddle::platform::GPUPlace(0); + gpu_tensor.CopyFrom(src_tensor, *gpu_place); + + // GPU Tensor to CPU Tensor + auto cpu_place = new paddle::platform::CPUPlace(); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + + // Compare Tensors + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + gpu_tensor.CopyFrom(slice_tensor, *gpu_place); + + // GPU Tensor to CPU Tensor + dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + + // Compare Slice Tensors + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for 
(size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + } +#endif +} diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h index 72c4a7a2a1..38fc2720a3 100644 --- a/paddle/framework/variable.h +++ b/paddle/framework/variable.h @@ -16,7 +16,7 @@ #include #include -#include "paddle/platform/assert.h" +#include "paddle/platform/enforce.h" namespace paddle { namespace framework { @@ -25,7 +25,7 @@ class Variable { public: template const T& Get() const { - PADDLE_ASSERT(IsType()); + PADDLE_ENFORCE(IsType(), "Variable must be type %s", typeid(T).name()); return *static_cast(holder_->Ptr()); } @@ -65,6 +65,17 @@ class Variable { std::unique_ptr holder_; // pointers to a PlaceholderImpl object indeed. + + // name_ is only meaningful with a Scope and accessible by it. + // + // NOTE: Please don't expose name_ by adding methods like + // Variable::Name or Scope::VarName! A variable could have a human + // readable name or an auto-generated scope-unique name. In the + // former case, the caller knows the name and doesn't need to access + // the name; in the latter case, the variable should be identified + // by its address but not the unreadable name. + friend class Scope; + const std::string* name_; }; } // namespace framework diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp new file mode 100644 index 0000000000..a89b6bba45 --- /dev/null +++ b/paddle/function/BlockExpandOp.cpp @@ -0,0 +1,202 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" +#include "Im2Col.h" + +namespace paddle { + +/* + * \brief Converts the image data of four dimensions(NCHW) into + * a sequence data of three dimensions(NST) in the forward calculation, + * which is reversed in the backward calculation. + * Where N is batch size, S is the length of the sequence after each + * image is expanded, T is the size of each time step in the sequence. + * + * Arguments in forward function: + * \param inputs[0] Image data of NCHW format. + * \param outputs[0] Sequence data of NST format. + * + * Arguments in backward function: + * \param inputs[0] Sequence data of NST format. + * \param outputs[0] Image data of NCHW format. + */ +class BlockExpandFunction : public FunctionBase { +public: + void init(const FuncConfig& config) override { + // function arguments + strides_ = config.get>("strides"); + paddings_ = config.get>("paddings"); + blocks_ = config.get>("blocks"); + + // number of inputs and outputs + numInputs_ = 1; + numOutputs_ = 1; + } + + void checkShape(const TensorShape& image, const TensorShape& sequence) const { + // image shape should be 4-dimensional. + CHECK_EQ(image.ndims(), (size_t)4); + // sequence shape should be 3-dimensional. + CHECK_EQ(sequence.ndims(), (size_t)3); + // The batchSize of the image needs to be equal to + // the batchSize of the sequence. 
+ CHECK_EQ(image[0], sequence[0]); + } + + // Calculate the shape of colData based on the shape of the image + // and the shape of the sequence. + TensorShape getColShape(const TensorShape& image, + const TensorShape& sequence) const { + size_t inputChannels = image[1]; + size_t inputHeight = image[2]; + size_t inputWidth = image[3]; + size_t seqLength = sequence[1]; + size_t stepSize = sequence[2]; + size_t outputHeight = + 1 + + (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH(); + size_t outputWidth = + 1 + + (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW(); + CHECK_EQ(seqLength, outputHeight * outputWidth); + CHECK_EQ(stepSize, inputChannels * blockH() * blockW()); + + // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + return TensorShape({outputHeight, + outputWidth, + inputChannels, + (size_t)blockH(), + (size_t)blockW()}); + } + +protected: + std::vector strides_; + std::vector paddings_; + std::vector blocks_; + + inline int strideH() const { return strides_[0]; } + + inline int strideW() const { return strides_[1]; } + + inline int paddingH() const { return paddings_[0]; } + + inline int paddingW() const { return paddings_[1]; } + + inline int blockH() const { return blocks_[0]; } + + inline int blockW() const { return blocks_[1]; } +}; + +template +class BlockExpandForward : public BlockExpandFunction { +public: + void init(const FuncConfig& config) override { + BlockExpandFunction::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& image = inputs[0].shape(); + const TensorShape& sequence = outputs[0].shape(); + checkShape(image, sequence); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + const TensorShape& image = inputs[0].shape(); + const TensorShape& sequence = outputs[0].shape(); + + TensorShape imShape = TensorShape({image[1], image[2], image[3]}); + TensorShape colShape = getColShape(image, sequence); + size_t batchSize = image[0]; + + real* imageData = inputs[0].data(); + real* seqData = outputs[0].data(); + Im2ColFunctor im2col; + for (size_t i = 0; i < batchSize; i++) { + // The result of im2col is [outputHeight, outputWidth, + // inputChannels, filterHeight, filterWidth], and it is easy to + // reshape into [seqLength, stepSize], where seqLength is equal + // output_height * output_width, stepSize is equal + // input_channels * filter_height * filter_width + im2col(imageData, + imShape, + seqData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + imageData += imShape.getElements(); + seqData += colShape.getElements(); + } + } +}; + +template +class BlockExpandBackward : public BlockExpandFunction { +public: + void init(const FuncConfig& config) override { + BlockExpandFunction::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& image = outputs[0].shape(); + const TensorShape& sequence = inputs[0].shape(); + checkShape(image, sequence); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + // Since the implementation of Col2ImFunctor is ADD_TO, + // this function only supports ADD_TO mode. 
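To make the shape bookkeeping in getColShape() above concrete, a small worked example (the numbers are illustrative and chosen for this sketch, not taken from BlockExpandOpTest.cpp below): for a 1 x 3 x 5 x 5 NCHW input with blocks = {3, 3}, strides = {1, 1} and paddings = {0, 0},

    outputHeight = 1 + (5 + 2 * 0 - 3 + 1 - 1) / 1 = 3
    outputWidth  = 1 + (5 + 2 * 0 - 3 + 1 - 1) / 1 = 3

so seqLength = 3 * 3 = 9, stepSize = 3 * 3 * 3 = 27, the NST output shape is {1, 9, 27}, and colShape per sample is [outputHeight, outputWidth, inputChannels, blockH, blockW] = [3, 3, 3, 3, 3].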
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO); + const TensorShape& image = outputs[0].shape(); + const TensorShape& sequence = inputs[0].shape(); + + TensorShape imShape = TensorShape({image[1], image[2], image[3]}); + TensorShape colShape = getColShape(image, sequence); + size_t batchSize = image[0]; + + real* imageData = outputs[0].data(); + real* seqData = inputs[0].data(); + Col2ImFunctor col2im; + for (size_t i = 0; i < batchSize; i++) { + col2im(imageData, + imShape, + seqData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + imageData += imShape.getElements(); + seqData += colShape.getElements(); + } + } +}; + +REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward); +REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward); +REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward); +#endif + +} // namespace paddle diff --git a/paddle/function/BlockExpandOpTest.cpp b/paddle/function/BlockExpandOpTest.cpp new file mode 100644 index 0000000000..59193a3ec3 --- /dev/null +++ b/paddle/function/BlockExpandOpTest.cpp @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "FunctionTest.h" + +namespace paddle { + +TEST(BlockExpandForward, real) { + for (size_t batchSize : {5}) { + for (size_t channels : {1, 5}) { + for (size_t inputHeight : {5, 33}) { + for (size_t inputWidth : {5, 32}) { + for (size_t block : {1, 3, 5}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + // init Test object + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector blocks = {block, block}; + CpuGpuFuncCompare test("BlockExpand", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + + size_t outputHeight = + 1 + + (inputHeight + 2 * padding - block + stride - 1) / stride; + size_t outputWidth = + 1 + + (inputWidth + 2 * padding - block + stride - 1) / stride; + TensorShape inputShape = + TensorShape({batchSize, channels, inputHeight, inputWidth}); + TensorShape outputShape = + TensorShape({batchSize, + outputHeight * outputWidth, + channels * block * block}); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, inputShape)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outputShape)); + // run Function + test.run(); + } + } + } + } + } + } + } +} + +TEST(BlockExpandBackward, real) { + for (size_t batchSize : {5}) { + for (size_t channels : {1, 5}) { + for (size_t inputHeight : {5, 33}) { + for (size_t inputWidth : {5, 32}) { + for (size_t block : {1, 3, 5}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + // init Test object + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector blocks = {block, block}; + CpuGpuFuncCompare test("BlockExpandGrad", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + + size_t 
outputHeight = + 1 + + (inputHeight + 2 * padding - block + stride - 1) / stride; + size_t outputWidth = + 1 + + (inputWidth + 2 * padding - block + stride - 1) / stride; + TensorShape inputShape = + TensorShape({batchSize, channels, inputHeight, inputWidth}); + TensorShape outputShape = + TensorShape({batchSize, + outputHeight * outputWidth, + channels * block * block}); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, outputShape)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inputShape), + ADD_TO); + // run Function + test.run(); + } + } + } + } + } + } + } +} + +} // namespace paddle diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp index 1744f37780..6b8e1e2da9 100644 --- a/paddle/function/BufferArgTest.cpp +++ b/paddle/function/BufferArgTest.cpp @@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) { sizeOfValuType(VALUE_TYPE_INT32)); SequenceIdArg buffer(memory.getBuf(), shape); EXPECT_EQ(buffer.data(), memory.getBuf()); - EXPECT_EQ(buffer.numSeqs(), 9); + EXPECT_EQ(buffer.numSeqs(), 9U); } } // namespace paddle diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 1518a8a654..c572a9d433 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -4,6 +4,10 @@ file(GLOB cpp_files . *Op.cpp) list(APPEND h_files Function.h) list(APPEND cpp_files Function.cpp) list(APPEND cpp_files BufferArg.cpp) +list(APPEND cpp_files GemmFunctor.cpp) +if(USE_EIGEN_FOR_BLAS) + list(APPEND cpp_files EigenGemm.cpp) +endif(USE_EIGEN_FOR_BLAS) if(WITH_GPU) file(GLOB cu_files . *OpGpu.cu) @@ -11,7 +15,6 @@ if(WITH_GPU) endif() if(USE_NNPACK) - include(nnpack/nnpack.cmake) list(APPEND cpp_files nnpack/NNPACKConvOp.cpp) if(WITH_TESTING) add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp) @@ -37,9 +40,13 @@ if(WITH_GPU) add_simple_unittest(MulOpTest) add_simple_unittest(CosSimOpTest) add_simple_unittest(RowConvOpTest) + add_simple_unittest(BlockExpandOpTest) + add_simple_unittest(CropOpTest) + add_simple_unittest(DepthwiseConvOpTest) endif() -add_simple_unittest(ConvOpTest) +add_simple_unittest(Im2ColTest) +add_simple_unittest(GemmConvOpTest) endif() add_style_check_target(paddle_function ${h_files}) diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu index 1a5b404240..4492dea5d8 100644 --- a/paddle/function/ContextProjectionOpGpu.cu +++ b/paddle/function/ContextProjectionOpGpu.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "ContextProjectionOp.h" +#include "hl_base.h" namespace paddle { @@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input, int block_size = blockDim.x; int sequenceId = blockIdx.x; int seq_start = sequence[sequenceId]; - int seq_end = sequence[sequenceId+1]; + int seq_end = sequence[sequenceId + 1]; real value = 0; int instances = seq_end - seq_start + context_length - 1; @@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input, } else if ((i + context_start) >= (seq_end - seq_start)) { if (padding) { value = - weight[(begin_pad + i + context_start - (seq_end - seq_start)) * - input_dim + idx]; + weight[(begin_pad + i + context_start - (seq_end - seq_start)) * + input_dim + + idx]; } else { continue; } @@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input, int outx = (i - context_length) < 0 ? 
i : (context_length - 1); int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1)); real* output_r = - output + outy * input_dim * context_length + outx * input_dim; + output + outy * input_dim * context_length + outx * input_dim; for (int j = outy; j < seq_end - seq_start; j++) { output_r[idx] += value; if (j - outy == outx) break; @@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input, dim3 grid(blocks_x, blocks_y); if (weight) { - KeContextProjectionForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, weight, output, input_dim, - context_length, context_start, begin_pad); - } else { - KeContextProjectionForward<<< grid, threads, 0, STREAM_DEFAULT >>> - (input, sequence, weight, output, input_dim, - context_length, context_start, begin_pad); + KeContextProjectionForward<<>>( + input, + sequence, + weight, + output, + input_dim, + context_length, + context_start, + begin_pad); + } else { + KeContextProjectionForward<<>>( + input, + sequence, + weight, + output, + input_dim, + context_length, + context_start, + begin_pad); } CHECK_SYNC("hl_context_projection_forward failed"); } @@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad, int block_size = blockDim.x; int sequenceId = blockIdx.x; int seq_start = sequence[sequenceId]; - int seq_end = sequence[sequenceId+1]; + int seq_end = sequence[sequenceId + 1]; real value = 0; int instances = seq_end - seq_start + context_length - 1; @@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad, int outx = (i - context_length) < 0 ? i : (context_length - 1); int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1)); real* output_r = - out + outy * input_dim * context_length + outx * input_dim; + out + outy * input_dim * context_length + outx * input_dim; for (int j = outy; j < seq_end - seq_start; j++) { value += output_r[idx]; if (j - outy == outx) break; @@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad, int blocks_y = 1; dim3 threads(block_size, 1); dim3 grid(blocks_x, blocks_y); - KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>> - (out_grad, sequence, input_grad, input_dim, context_length, context_start); + KeContextProjectionBackwardData<<>>( + out_grad, sequence, input_grad, input_dim, context_length, context_start); CHECK_SYNC("hl_context_projection_backward_data failed"); } @@ -231,7 +244,7 @@ void ContextProjectionBackwardData(const GpuMatrix& out_grad, context_start); } -template +template __global__ void KeContextProjectionBackwardWeight(const real* out_grad, const int* sequence, real* w_grad, @@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, if (weight_idx < w_dim) { for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) { int seq_start = sequence[seqId]; - int seq_end = sequence[seqId+1]; - output_r = const_cast(out_grad) - + seq_start * w_dim * context_length; + int seq_end = sequence[seqId + 1]; + output_r = + const_cast(out_grad) + seq_start * w_dim * context_length; if (context_start < 0) { if (padId + context_start < 0) { instanceId = padId; } else { // begin_pad > 0; - instanceId = (padId - begin_pad) + - (seq_end - seq_start) - context_start; + instanceId = + (padId - begin_pad) + (seq_end - seq_start) - context_start; } } else { if (padId + (seq_end - seq_start) < context_start) { @@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, } } - int outx = 
(instanceId - context_length) < 0 ? - instanceId : (context_length - 1); - int outy = (instanceId - context_length) < 0 ? - 0 : (instanceId - (context_length - 1)); + int outx = + (instanceId - context_length) < 0 ? instanceId : (context_length - 1); + int outy = (instanceId - context_length) < 0 + ? 0 + : (instanceId - (context_length - 1)); output_r += outy * w_dim * context_length + outx * w_dim; for (int j = outy; j < seq_end - seq_start; j++) { value += output_r[weight_idx]; @@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad, } __syncthreads(); - for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) { + for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) { if (idy < stride) { sum_s[idy][idx] += sum_s[idy + stride][idx]; } @@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad, dim3 threads(threads_x, threads_y); dim3 grid(blocks_x, 1); - KeContextProjectionBackwardWeight<32, 32> - <<< grid, threads, 0, STREAM_DEFAULT >>> - (out_grad, sequence, w_grad, num_sequences, w_dim, - context_length, context_start, begin_pad); + KeContextProjectionBackwardWeight<32, + 32><<>>( + out_grad, + sequence, + w_grad, + num_sequences, + w_dim, + context_length, + context_start, + begin_pad); CHECK_SYNC("hl_context_projection_backward_weight failed"); } template <> -void ContextProjectionBackwardWeight( - const GpuMatrix& out_grad, - GpuMatrix& w_grad, - const GpuIVector& seq_vec, - size_t context_length, - int context_start, - size_t total_pad, - size_t begin_pad) { +void ContextProjectionBackwardWeight(const GpuMatrix& out_grad, + GpuMatrix& w_grad, + const GpuIVector& seq_vec, + size_t context_length, + int context_start, + size_t total_pad, + size_t begin_pad) { hl_context_projection_backward_weight(out_grad.getData(), seq_vec.getData(), w_grad.getData(), @@ -376,23 +395,18 @@ void ContextProjectionBackward(const GpuMatrix& out_grad, size_t begin_pad, bool is_padding, size_t total_pad) { - if (in_grad) { - ContextProjectionBackwardData( - out_grad, - in_grad, - sequence, - context_length, - context_start); - } - if (is_padding && w_grad) { - ContextProjectionBackwardWeight( - out_grad, - w_grad, - sequence, - context_length, - context_start, - total_pad, - begin_pad); + if (in_grad) { + ContextProjectionBackwardData( + out_grad, in_grad, sequence, context_length, context_start); + } + if (is_padding && w_grad) { + ContextProjectionBackwardWeight(out_grad, + w_grad, + sequence, + context_length, + context_start, + total_pad, + begin_pad); } } diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h index bb4f48364b..baf78bc6c8 100644 --- a/paddle/function/ConvOp.h +++ b/paddle/function/ConvOp.h @@ -109,6 +109,13 @@ protected: return filter[filter.ndims() - 1]; } + // determine whether im2col needs to be performed + inline bool isNeedIm2col(const TensorShape& filter) const { + return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 && + strideH() == 1 && strideW() == 1 && paddingH() == 0 && + paddingW() == 0); + } + std::vector strides_; std::vector paddings_; diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp deleted file mode 100644 index dfa2f78461..0000000000 --- a/paddle/function/ConvOpTest.cpp +++ /dev/null @@ -1,210 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "Function.h" -#include "FunctionTest.h" - -namespace paddle { - -enum TestType { - kForwardTest = 0, - kBackwardInputTest = 1, - kBackwardFilterTest = 2, -}; - -template -class ConvolutionTest { -public: - ConvolutionTest(const std::string& conv1, - const std::string& conv2, - TestType type, - std::string algo = "auto") { - for (size_t batchSize : {1, 32}) { - for (size_t inputSize : {7, 14, 54}) { - for (size_t filterSize : {1, 3, 5}) { - for (size_t inputChannels : {3, 64}) { - for (size_t outputChannels : {3, 64, 128}) { - if (inputChannels < outputChannels) break; - for (size_t stride : {1, 2}) { - for (size_t padding : {0, 1}) { - if (padding >= filterSize) break; - size_t outputSize = - (inputSize - filterSize + 2 * padding + stride) / stride; - VLOG(3) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputSize - << " inputWidth=" << inputSize - << " outputChannels=" << outputChannels - << " filterHeight=" << filterSize - << " filterWidth=" << filterSize - << " outputHeight=" << outputSize - << " outputWidth=" << outputSize - << " stride=" << stride << " padding=" << padding; - - std::vector paddings = {padding, padding}; - std::vector strides = {stride, stride}; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)1) - .set("algo", algo)); - - TensorShape input{ - batchSize, inputChannels, inputSize, inputSize}; - TensorShape filter{ - outputChannels, inputChannels, filterSize, filterSize}; - TensorShape output{ - batchSize, outputChannels, outputSize, outputSize}; - - if (type == kForwardTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.run(); - } else if (type == kBackwardInputTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); - test.run(); - } else if (type == kBackwardFilterTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.run(); - } - } - } - } - } - } - } - } - } -}; - -// Mainly used to test cases where the height and width (input, filter) -// are not equal. 
-template -class ConvolutionTest2 { -public: - ConvolutionTest2(const std::string& conv1, - const std::string& conv2, - TestType type, - std::string algo = "auto") { - for (size_t batchSize : {16}) { - for (size_t inputHeight : {7, 31}) { - for (size_t inputWidth : {10, 54}) { - for (size_t filterHeight : {1, 5}) { - for (size_t filterWidth : {3, 7}) { - for (size_t inputChannels : {7}) { - for (size_t outputChannels : {32}) { - size_t stride = 1; - size_t padding = 0; - size_t outputHeight = - (inputHeight - filterHeight + 2 * padding + stride) / - stride; - size_t outputWidth = - (inputWidth - filterWidth + 2 * padding + stride) / - stride; - VLOG(3) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputHeight - << " inputWidth=" << inputWidth - << " outputChannels=" << outputChannels - << " filterHeight=" << filterHeight - << " filterWidth=" << filterWidth - << " outputHeight=" << outputHeight - << " outputWidth=" << outputWidth - << " stride=" << stride << " padding=" << padding; - - std::vector paddings = {padding, padding}; - std::vector strides = {stride, stride}; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)1) - .set("algo", algo)); - - TensorShape input{ - batchSize, inputChannels, inputHeight, inputWidth}; - TensorShape filter{ - outputChannels, inputChannels, filterHeight, filterWidth}; - TensorShape output{ - batchSize, outputChannels, outputHeight, outputWidth}; - - if (type == kForwardTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.run(); - } else if (type == kBackwardInputTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); - test.run(); - } else if (type == kBackwardFilterTest) { - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter)); - test.run(); - } - } - } - } - } - } - } - } - } -}; - -TEST(Forward, GEMM) { - ConvolutionTest test( - "NaiveConv-CPU", "GemmConv-CPU", kForwardTest); - ConvolutionTest2 test2( - "NaiveConv-CPU", "GemmConv-CPU", kForwardTest); -} - -#ifndef PADDLE_ONLY_CPU -TEST(Forward, GEMM2) { - ConvolutionTest test( - "GemmConv-CPU", "GemmConv-GPU", kForwardTest); - ConvolutionTest2 test2( - "GemmConv-CPU", "GemmConv-GPU", kForwardTest); -} - -TEST(BackwardInput, GEMM) { - ConvolutionTest test( - "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest); - ConvolutionTest2 test2( - "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest); -} - -TEST(BackwardFilter, GEMM) { - ConvolutionTest test( - "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); - ConvolutionTest2 test2( - "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest); -} -#endif - -} // namespace paddle diff --git a/paddle/function/ConvOpTest.h b/paddle/function/ConvOpTest.h new file mode 100644 index 0000000000..cb02a96d0d --- /dev/null +++ b/paddle/function/ConvOpTest.h @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "FunctionTest.h" + +namespace paddle { + +template +void forward(Compare2Function& test, + const TensorShape& input, + const TensorShape& filter, + const TensorShape& output) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.run(); +} + +template +void backward_input(Compare2Function& test, + const TensorShape& input, + const TensorShape& filter, + const TensorShape& output) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO); + test.run(); +} + +template +void backward_filter(Compare2Function& test, + const TensorShape& input, + const TensorShape& filter, + const TensorShape& output) { + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter), ADD_TO); + test.run(); +} + +template +using Function = void (*)(Compare2Function& test, + const TensorShape& input, + const TensorShape& filter, + const TensorShape& output); + +/** + * \brief A basic convolution function test interface. + * + * \param conv1 type name of convolution function 1. + * \param conv2 type name of convolution function 2. + * \param function test function, can be one of the forward, backward_input + * backward_filter function. + * Example: + * 1. 
Compare GemmConv's CPU and GPU implementation: + * Convolution( + * "GemmConv-CPU", "GemmConv-GPU", forward); + */ +template +void Convolution(const std::string& conv1, + const std::string& conv2, + Function function) { + for (size_t batchSize : {1, 5}) { + for (size_t inputSize : {7, 14, 31}) { + for (size_t filterSize : {1, 3, 5}) { + for (size_t inputChannels : {3, 16}) { + for (size_t outputChannels : {3, 16}) { + if (outputChannels < inputChannels) continue; + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + if (padding >= filterSize) break; + + // NNPACK only supports stride = 1 if batchSize > 1 + if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") && + batchSize > 1 && stride > 1) + break; + + size_t outputSize = + (inputSize - filterSize + 2 * padding + stride) / stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputSize + << " inputWidth=" << inputSize + << " outputChannels=" << outputChannels + << " filterHeight=" << filterSize + << " filterWidth=" << filterSize + << " outputHeight=" << outputSize + << " outputWidth=" << outputSize << " stride=" << stride + << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)1) + .set("algo", (std::string) "auto")); + + TensorShape input{ + batchSize, inputChannels, inputSize, inputSize}; + TensorShape filter{ + outputChannels, inputChannels, filterSize, filterSize}; + TensorShape output{ + batchSize, outputChannels, outputSize, outputSize}; + + function(test, input, filter, output); + } + } + } + } + } + } + } +} + +/** + * \brief A convolution function test interface for + * image height is not equal image width. + */ +template +void Convolution2(const std::string& conv1, + const std::string& conv2, + Function function) { + for (size_t batchSize : {4}) { + for (size_t inputHeight : {7, 31}) { + for (size_t inputWidth : {10, 54}) { + for (size_t filterHeight : {1, 5}) { + for (size_t filterWidth : {3, 7}) { + for (size_t inputChannels : {7}) { + for (size_t outputChannels : {7}) { + size_t stride = 1; + size_t padding = 0; + size_t outputHeight = + (inputHeight - filterHeight + 2 * padding + stride) / + stride; + size_t outputWidth = + (inputWidth - filterWidth + 2 * padding + stride) / stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputHeight + << " inputWidth=" << inputWidth + << " outputChannels=" << outputChannels + << " filterHeight=" << filterHeight + << " filterWidth=" << filterWidth + << " outputHeight=" << outputHeight + << " outputWidth=" << outputWidth + << " stride=" << stride << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)1) + .set("algo", (std::string) "auto")); + + TensorShape input{ + batchSize, inputChannels, inputHeight, inputWidth}; + TensorShape filter{ + outputChannels, inputChannels, filterHeight, filterWidth}; + TensorShape output{ + batchSize, outputChannels, outputHeight, outputWidth}; + + function(test, input, filter, output); + } + } + } + } + } + } + } +} + +/** + * \brief A convolution function test interface for depthwise convolution. 
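+ * Here "depthwise" means groups == inputChannels: each input channel is
+ * convolved with its own outputChannels / groups filters, which is why the
+ * filter shape set up below is {groups, outputChannels / groups,
+ * inputChannels / groups, filterSize, filterSize}.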
+ */ +template +void DepthwiseConvolution(const std::string& conv1, + const std::string& conv2, + Function function) { + for (size_t batchSize : {1, 32}) { + for (size_t inputSize : {7, 14, 54}) { + for (size_t filterSize : {3, 4}) { + for (size_t inputChannels : {32}) { + for (size_t outputChannels : {32, 64}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + // NNPACK only supports stride = 1 if batchSize > 1, + // and there has some bug when batchSize > 1 and groups != 1 + if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") && + batchSize > 1) + break; + + size_t outputSize = + (inputSize - filterSize + 2 * padding + stride) / stride; + VLOG(3) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputSize + << " inputWidth=" << inputSize + << " outputChannels=" << outputChannels + << " filterHeight=" << filterSize + << " filterWidth=" << filterSize + << " outputHeight=" << outputSize + << " outputWidth=" << outputSize << " stride=" << stride + << " padding=" << padding; + + std::vector paddings = {padding, padding}; + std::vector strides = {stride, stride}; + size_t groups = inputChannels; + Compare2Function test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", groups) + .set("algo", (std::string) "auto")); + + TensorShape input{ + batchSize, inputChannels, inputSize, inputSize}; + TensorShape filter{groups, + outputChannels / groups, + inputChannels / groups, + filterSize, + filterSize}; + TensorShape output{ + batchSize, outputChannels, outputSize, outputSize}; + + function(test, input, filter, output); + } + } + } + } + } + } + } +} + +} // namespace paddle diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu index c62ab39551..a1f88f479b 100644 --- a/paddle/function/CosSimOpGpu.cu +++ b/paddle/function/CosSimOpGpu.cu @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "CosSimOp.h" #include "hl_base.h" #include "hl_device_functions.cuh" -#include "CosSimOp.h" namespace paddle { -template +template __global__ void KeCosSim(real* output, const real* input1, const real* input2, @@ -78,8 +78,8 @@ void hlCossim(real* output, dim3 threads(block_size, 1); dim3 grid(1, input1_height); - KeCosSim<<>> - (output, input1, input2, width, input1_height, input2_height, scale); + KeCosSim<<>>( + output, input1, input2, width, input1_height, input2_height, scale); CHECK_SYNC("hlCossim failed"); } @@ -99,7 +99,7 @@ void CosSimForward(GpuMatrix& out_mat, hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale); } -template +template __global__ void KeCosSimDerivative(const real* grad, const real* output, const real* prev_out_x, @@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad, if (xy[0] == 0) { real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0])); for (int index = tid; index < width; index += block_size) { - prev_grad_x[index] += - scale * grad[ty] * prev_out_y[index] * reciprocal; + prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal; if (input2_height > 1) { - prev_grad_y[index] += - scale * grad[ty] * prev_out_x[index] * reciprocal; + prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal; } else { - paddle::paddleAtomicAdd(prev_grad_y + index, - scale * grad[ty] * prev_out_x[index] * reciprocal); + paddle::paddleAtomicAdd( + prev_grad_y + index, + scale * grad[ty] * prev_out_x[index] * reciprocal); } } } else { @@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad, real reciprocalSquareSumX = 1.0 / xx[0]; real reciprocalSquareSumY = 1.0 / yy[0]; for (int index = tid; index < width; index += block_size) { - prev_grad_x[index] += output[ty] * grad[ty] * - (prev_out_y[index] * reciprocalXY - - prev_out_x[index] * reciprocalSquareSumX); + prev_grad_x[index] += + output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY - + prev_out_x[index] * reciprocalSquareSumX); if (input2_height > 1) { - prev_grad_y[index] += output[ty] * grad[ty] * - (prev_out_x[index] * reciprocalXY - - prev_out_y[index] * reciprocalSquareSumY); + prev_grad_y[index] += + output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY - + prev_out_y[index] * reciprocalSquareSumY); } else { - paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] * - (prev_out_x[index] * reciprocalXY - - prev_out_y[index] * reciprocalSquareSumY)); + paddle::paddleAtomicAdd( + prev_grad_y + index, + output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY - + prev_out_y[index] * reciprocalSquareSumY)); } } } @@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad, const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, input1_height); - KeCosSimDerivative<<>> - (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width, - input1_height, input2_height, scale); + KeCosSimDerivative<<>>( + grad, + output, + prev_out_x, + prev_out_y, + prev_grad_x, + prev_grad_y, + width, + input1_height, + input2_height, + scale); CHECK_SYNC("hlCossimDerivate failed"); } @@ -214,9 +222,9 @@ void CosSimBackward(const GpuMatrix& out_grad, real scale) { CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() && in2_val.getData() && in1_grad.getData() && in2_grad.getData()); - CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ - && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_) - << "Matrix types are not equally GPU"; + CHECK(out_grad.useGpu_ && out_val.useGpu_ && 
in1_val.useGpu_ && + in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_) + << "Matrix types are not equally GPU"; size_t dim = in1_val.getWidth(); const real* grad = out_grad.getData(); diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp new file mode 100644 index 0000000000..f12ee43e3d --- /dev/null +++ b/paddle/function/CropOp.cpp @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CropOp.h" +#include "paddle/function/TensorShape.h" +#include "paddle/math/Vector.h" + +namespace paddle { + +template <> +void Crop(real* outputs, + const real* inputs, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { + std::vector crop_corner = + conf.get>("crop_corner"); + int cCrop = crop_corner[1]; + int hCrop = crop_corner[2]; + int wCrop = crop_corner[3]; + + int num = inShape[0]; + int inC = inShape[1]; + int inH = inShape[2]; + int inW = inShape[3]; + + int outC = outShape[1]; + int outH = outShape[2]; + int outW = outShape[3]; + + for (int n = 0; n < num; n++) { + for (int c = 0; c < outC; c++) { + for (int h = 0; h < outH; h++) { + int outoff = ((n * outC + c) * outH + h) * outW; + int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop; + memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real)); + } + } + } +} + +template <> +void CropGrad(const real* inGrad, + real* outGrad, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { + std::vector crop_corner = + conf.get>("crop_corner"); + int cCrop = crop_corner[1]; + int hCrop = crop_corner[2]; + int wCrop = crop_corner[3]; + + int num = outShape[0]; + int outC = outShape[1]; + int outH = outShape[2]; + int outW = outShape[3]; + + int inC = inShape[1]; + int inH = inShape[2]; + int inW = inShape[3]; + + for (int n = 0; n < num; n++) { + for (int c = 0; c < inC; c++) { + for (int h = 0; h < inH; h++) { + int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop; + int inoff = ((n * inC + c) * inH + h) * inW; + CpuVector inG = CpuVector(inW, const_cast(inGrad + inoff)); + CpuVector outG = CpuVector(inW, outGrad + outoff); + outG += inG; + } + } + } +} + +/** + * \brief Crop input according to the specify corner and shape. + * The input and output is a 4D tensor. In CropFunc, we only + * crop the 2nd to 4th dimension. + * + * Argument in this Function: + * \param pad_ A struct object contains the cropping corner and shape. + * \param inputs A 4D tensor, only one input. + * \param outputs A 4D tensor, the output value after cropping. 
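+ * Note: "crop_corner" in the FuncConfig is a 4-D offset in NCHW order; the
+ * CPU implementation above and the GPU kernel added later in this diff read
+ * only crop_corner[1], crop_corner[2] and crop_corner[3] (the channel,
+ * height and width offsets).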
+ * + * For example, + * Input(2,2,2,3) = [ + * [ [[1,2,3], [3,4,5]], + * [[2,3,5], [1,6,7]] ], + * [ [[4,3,1], [1,8,7]], + * [[3,8,9], [2,3,5]] ] + * ] # the input shape is (2,2,2,3) + * + * pad_: if corner = (0,1,1) and crop_shape = (2,1,2) + * Output(2,2,1,2) = [ + * [ [[4,5]], + * [[6,7]] ], + * [ [[8,7]], + * [[3,5]] ] + * ] # the input shape is (2,2,2,3) + */ +template +class CropFunc : public FunctionBase { +public: + void init(const FuncConfig& config) override { conf_ = config; } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + + TensorShape inShape = inputs[0].shape(); + TensorShape outShape = outputs[0].shape(); + + Crop(outputs[0].data(), + inputs[0].data(), + inShape, + outShape, + conf_); + } + +private: + FuncConfig conf_; +}; + +/** + * \brief The backward propagation of cropping Function. + * + * Argument in this Function: + * \param crop_ The same meaning as it in CropFunc. + * \param inputs The gradient with respect to the output value of CropFunc. + * \param outputs The gradient with respect to the input value of CropFunc. + */ + +template +class CropGradFunc : public FunctionBase { +public: + void init(const FuncConfig& config) override { conf_ = config; } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + + TensorShape outShape = outputs[0].shape(); + TensorShape inShape = inputs[0].shape(); + + CropGrad(inputs[0].data(), + outputs[0].data(), + inShape, + outShape, + conf_); + } + +private: + FuncConfig conf_; +}; + +REGISTER_TYPED_FUNC(Crop, CPU, CropFunc); +REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(Crop, GPU, CropFunc); +REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/CropOp.h b/paddle/function/CropOp.h new file mode 100644 index 0000000000..87986fbdc7 --- /dev/null +++ b/paddle/function/CropOp.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Function.h" + +namespace paddle { + +/** + * \brief This funtion crops inputs according to the specify start point and + *shape. + * + * \param[out] outputs save results. + * \param[in] inputs input data. + * \param[in] inShape the shape of input tensor. + * \param[in] conf the cropping config + */ +template +void Crop(real* outputs, + const real* inputs, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf); + +/** + * \brief Cropping operation backward. + * + * \param[out] inGrad gradients of previous layer + * \param[in] outGrad output gradient + * \param[in] inShape the shape of input tensor. 
+ * \param[in] conf the cropping config + */ +template +void CropGrad(const real* inGrad, + real* outGrad, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf); +} // namespace paddle diff --git a/paddle/function/CropOpGpu.cu b/paddle/function/CropOpGpu.cu new file mode 100644 index 0000000000..241356a9ca --- /dev/null +++ b/paddle/function/CropOpGpu.cu @@ -0,0 +1,150 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CropOp.h" +#include "hl_base.h" + +namespace paddle { + +__global__ void KeCrop(real* outputs, + const real* inputs, + int inC, + int inH, + int inW, + int cropC, + int cropH, + int cropW, + int outC, + int outH, + int outW, + int nthreads) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < nthreads) { + const int w = idx % outW; + const int h = (idx / outW) % outH; + const int c = (idx / outW / outH) % outC; + const int n = idx / outW / outH / outC; + + const int off = ((n * inC + c + cropC) * inH + h + cropH) * inW + cropW + w; + outputs[idx] = inputs[off]; + } +} + +template <> +void Crop(real* outputs, + const real* inputs, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { + std::vector crop_corner = + conf.get>("crop_corner"); + int cropC = crop_corner[1]; + int cropH = crop_corner[2]; + int cropW = crop_corner[3]; + + int num = inShape[0]; + int inC = inShape[1]; + int inH = inShape[2]; + int inW = inShape[3]; + + int outC = outShape[1]; + int outH = outShape[2]; + int outW = outShape[3]; + + size_t nth = num * outC * outH * outW; + int blockSize = 1024; + int gridSize = (nth + blockSize - 1) / blockSize; + + KeCrop<<>>(outputs, + inputs, + inC, + inH, + inW, + cropC, + cropH, + cropW, + outC, + outH, + outW, + nth); + CHECK_SYNC("Crop"); +} + +__global__ void KeCropDiff(const real* inGrad, + real* outGrad, + int inC, + int inH, + int inW, + int cropC, + int cropH, + int cropW, + int outC, + int outH, + int outW, + int nthreads) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < nthreads) { + const int w = idx % inW; + const int h = (idx / inW) % inH; + const int c = (idx / inW / inH) % inC; + const int n = idx / inW / inH / inC; + + const int off = + ((n * outC + c + cropC) * outH + h + cropH) * outW + cropW + w; + + outGrad[off] += inGrad[idx]; + } +} + +template <> +void CropGrad(const real* inGrad, + real* outGrad, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { + std::vector crop_corner = + conf.get>("crop_corner"); + int cropC = crop_corner[1]; + int cropH = crop_corner[2]; + int cropW = crop_corner[3]; + + int num = outShape[0]; + int outC = outShape[1]; + int outH = outShape[2]; + int outW = outShape[3]; + + int inC = inShape[1]; + int inH = inShape[2]; + int inW = inShape[3]; + + size_t nth = num * inC * inH * inW; + int blockSize = 1024; + int gridSize = (nth + blockSize - 1) / blockSize; + + KeCropDiff<<>>(inGrad, + outGrad, + inC, + inH, + inW, + cropC, + 
cropH, + cropW, + outC, + outH, + outW, + nth); + CHECK_SYNC("CropGrad"); +} + +} // namespace paddle diff --git a/paddle/function/CropOpTest.cpp b/paddle/function/CropOpTest.cpp new file mode 100644 index 0000000000..6f11abfdf6 --- /dev/null +++ b/paddle/function/CropOpTest.cpp @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "FunctionTest.h" + +namespace paddle { + +TEST(Crop, real) { + for (size_t numSamples : {5, 32}) { + for (size_t channels : {5, 5, 32}) { + for (size_t imgSizeH : {5, 33, 100}) { + for (size_t imgSizeW : {5, 32, 96}) { + VLOG(3) << " numSamples=" << numSamples << " channels=" << channels + << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; + for (bool test_grad : {false, true}) { + CpuGpuFuncCompare compare( + test_grad ? "CropGrad" : "Crop", + FuncConfig() + .set>("crop_corner", {0, 1, 1, 1}) + .set>("crop_shape", {0, 2, 3, 3})); + TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW}; + TensorShape outDims{numSamples, 2, 3, 3}; + compare.addInputs( + BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims)); + compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, + test_grad ? inDims : outDims, + test_grad ? ADD_TO : ASSIGN_TO), + test_grad ? ADD_TO : ASSIGN_TO); + compare.run(); + } + } + } + } + } +} + +} // namespace paddle diff --git a/paddle/function/CrossMapNormalOpGpu.cu b/paddle/function/CrossMapNormalOpGpu.cu index b33dd10834..88b991ff6a 100644 --- a/paddle/function/CrossMapNormalOpGpu.cu +++ b/paddle/function/CrossMapNormalOpGpu.cu @@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
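The crop kernels above and the CrossMapNormal kernels reformatted below share the same launch recipe: one thread per element, a 1-D grid sized by ceiling division, and the linear thread index decomposed back into NCHW coordinates. A minimal sketch of that pattern (variable names here are illustrative, not taken from the diff):

    int blockSize = 1024;
    int gridSize = (numElements + blockSize - 1) / blockSize;  // ceiling division
    // inside the kernel:
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < numElements) {
      const int w = idx % W;
      const int h = (idx / W) % H;
      const int c = (idx / W / H) % C;
      const int n = idx / W / H / C;
      // ... operate on element (n, c, h, w)
    }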
*/ -#include "hl_base.h" #include "CrossMapNormalOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KeCMRNormFillScale(size_t imageSize, const real* in, - real* scale, size_t channels, - size_t height, size_t width, size_t size, +__global__ void KeCMRNormFillScale(size_t imageSize, + const real* in, + real* scale, + size_t channels, + size_t height, + size_t width, + size_t size, real alpha) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < imageSize) { @@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in, } } -__global__ void KeCMRNormOutput(size_t inputSize, const real* in, - const real* scale, real negative_beta, +__global__ void KeCMRNormOutput(size_t inputSize, + const real* in, + const real* scale, + real negative_beta, real* out) { const int index = threadIdx.x + blockIdx.x * blockDim.x; if (index < inputSize) { @@ -74,24 +80,30 @@ void CrossMapNormal(real* outputs, size_t imageSize = numSamples * height * width; int blockSize = 1024; int gridSize = (imageSize + 1024 - 1) / 1024; - KeCMRNormFillScale<<>> - (imageSize, inputs, denoms, channels, height, width, size, scale); + KeCMRNormFillScale<<>>( + imageSize, inputs, denoms, channels, height, width, size, scale); - size_t inputSize = numSamples * height * width *channels; + size_t inputSize = numSamples * height * width * channels; blockSize = 1024; gridSize = (inputSize + 1024 - 1) / 1024; - KeCMRNormOutput<<>> - (inputSize, inputs, denoms, -pow, outputs); + KeCMRNormOutput<<>>( + inputSize, inputs, denoms, -pow, outputs); CHECK_SYNC("CrossMapNormal"); } -__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, - const real* top_data, const real* scale, - const real* top_diff, size_t channels, - size_t height, size_t width, size_t size, - real negative_beta, real cache_ratio, - real* bottom_diff ) { +__global__ void KeCMRNormDiff(size_t imageSize, + const real* bottom_data, + const real* top_data, + const real* scale, + const real* top_diff, + size_t channels, + size_t height, + size_t width, + size_t size, + real negative_beta, + real cache_ratio, + real* bottom_diff) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < imageSize) { const int w = idx % width; @@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data, while (index < channels + post_pad) { if (index < channels) { accum += top_diff[index * step] * top_data[index * step] / - scale[index * step]; + scale[index * step]; } if (index >= size) { accum -= top_diff[(index - size) * step] * - top_data[(index - size) * step] / scale[(index - size) * step]; + top_data[(index - size) * step] / scale[(index - size) * step]; } if (index >= post_pad) { bottom_diff[(index - post_pad) * step] += - top_diff[(index - post_pad) * step] * - pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(index - post_pad) * step] * accum; + top_diff[(index - post_pad) * step] * + pow(scale[(index - post_pad) * step], negative_beta) - + cache_ratio * bottom_data[(index - post_pad) * step] * accum; } ++index; } @@ -147,9 +159,18 @@ void CrossMapNormalGrad(real* inputsGrad, int blockSize = 1024; int gridSize = (imageSize + 1024 - 1) / 1024; - KeCMRNormDiff <<>> - (imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels, - height, width, size, -pow, 2.0f * pow * scale, inputsGrad); + KeCMRNormDiff<<>>(imageSize, + inputsValue, + outputsValue, + denoms, + outputsGrad, + channels, + height, + width, + size, + -pow, + 2.0f * pow * 
scale, + inputsGrad); CHECK_SYNC("CrossMapNormalGrad"); } diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp index ed17b17da6..3b390db77f 100644 --- a/paddle/function/CrossMapNormalOpTest.cpp +++ b/paddle/function/CrossMapNormalOpTest.cpp @@ -18,11 +18,11 @@ limitations under the License. */ namespace paddle { TEST(CrossMapNormal, real) { - for (size_t numSamples : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t imgSizeH : {5, 33, 100}) { - for (size_t imgSizeW : {5, 32, 96}) { - for (size_t size : {1, 2, 3, 5, 7}) { + for (size_t numSamples : {5}) { + for (size_t channels : {1, 5}) { + for (size_t imgSizeH : {5, 33}) { + for (size_t imgSizeW : {5, 32}) { + for (size_t size : {1, 3}) { VLOG(3) << " numSamples=" << numSamples << " channels=" << channels << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " size=" << size; @@ -48,11 +48,11 @@ TEST(CrossMapNormal, real) { } TEST(CrossMapNormalGrad, real) { - for (size_t numSamples : {5, 32}) { - for (size_t channels : {1, 5, 32}) { - for (size_t imgSizeH : {5, 33, 100}) { - for (size_t imgSizeW : {5, 32, 96}) { - for (size_t size : {1, 2, 3, 5, 7}) { + for (size_t numSamples : {5}) { + for (size_t channels : {1, 5}) { + for (size_t imgSizeH : {5, 33}) { + for (size_t imgSizeW : {5, 32}) { + for (size_t size : {1, 3}) { VLOG(3) << " numSamples=" << numSamples << " channels=" << channels << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " size=" << size; diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp new file mode 100644 index 0000000000..2f3112fe65 --- /dev/null +++ b/paddle/function/DepthwiseConvOp.cpp @@ -0,0 +1,305 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "DepthwiseConvOp.h" +#include "ConvOp.h" + +namespace paddle { + +template +class DepthwiseConvFunctor { +public: + void operator()(const T* inputData, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData) { + // TODO(zhaolong) : cpu implementation of depthwise convolution + } +}; + +template +class DepthwiseConvGradInputFunctor { +public: + void operator()(const T* outputGrad, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad) {} + // TODO(zhaolong) : cpu implementation of depthwise convolution +}; + +template +class DepthwiseConvGradFilterFunctor { +public: + void operator()(const T* outputGrad, + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* filterGrad) {} + // TODO(zhaolong) : cpu implementation of depthwise convolution +}; + +/* + * \brief Forward calculation of depthwise convolution. + */ +template +class DepthwiseConvFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + size_t filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); + + real* inputData = inputs[0].data(); + real* filterData = inputs[1].data(); + real* outputData = outputs[0].data(); + + DepthwiseConvFunctor depthwiseConv; + depthwiseConv(inputData, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH(), + strideW(), + paddingH(), + paddingW(), + outputData); + } +}; + +/* + * \brief Backward input calculation of depthwise convolution. 
+ */ +template +class DepthwiseConvGradInputFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& output = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& input = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + check(inputs, outputs); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + const TensorShape& output = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& input = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + size_t filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); + + real* outputGrad = inputs[0].data(); + real* filterData = inputs[1].data(); + real* inputGrad = outputs[0].data(); + + DepthwiseConvGradInputFunctor depthwiseConvGradInput; + depthwiseConvGradInput(outputGrad, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH(), + strideW(), + paddingH(), + paddingW(), + inputGrad); + } +}; + +/* + * \brief Backward filter calculation of depthwise convolution. 
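+ *
+ * For each sample, the GPU implementation writes the per-tap products
+ * outputGrad * input into a column buffer of shape
+ * [outputChannels * filterHeight * filterWidth, outputHeight * outputWidth]
+ * and then sums each row (over all output positions) with
+ * BaseMatrix::sumRows to accumulate the filter gradient (see
+ * DepthwiseConvGradFilterFunctor in DepthwiseConvOpGpu.cu).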
+ */ +template +class DepthwiseConvGradFilterFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& output = inputs[0].shape(); + const TensorShape& input = inputs[1].shape(); + const TensorShape& filter = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + check(inputs, outputs); + const TensorShape& output = inputs[0].shape(); + const TensorShape& input = inputs[1].shape(); + const TensorShape& filter = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + size_t filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); + + real* outputGrad = inputs[0].data(); + real* inputData = inputs[1].data(); + real* filterGrad = outputs[0].data(); + + int size = outputChannels * filterHeight * filterWidth * outputHeight * + outputWidth; + resizeBuffer(size); + real* colData = reinterpret_cast(memory_->getBuf()); + + DepthwiseConvGradFilterFunctor depthwiseConvGradFilter; + + depthwiseConvGradFilter(outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH(), + strideW(), + paddingH(), + paddingW(), + colData, + filterGrad); + } +}; + +REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction); +REGISTER_TYPED_FUNC(DepthwiseConvGradInput, + CPU, + DepthwiseConvGradInputFunction); +REGISTER_TYPED_FUNC(DepthwiseConvGradFilter, + CPU, + DepthwiseConvGradFilterFunction); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction); +REGISTER_TYPED_FUNC(DepthwiseConvGradInput, + GPU, + DepthwiseConvGradInputFunction); +REGISTER_TYPED_FUNC(DepthwiseConvGradFilter, + GPU, + DepthwiseConvGradFilterFunction); +#endif + +} // namespace paddle diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h new file mode 100644 index 0000000000..1bf70e52f3 --- /dev/null +++ b/paddle/function/DepthwiseConvOp.h @@ -0,0 +1,159 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "TensorType.h" + +namespace paddle { + +/** + *\brief Depthwise convolution forward. The outputData + * of depthwise convolution is same with ExpandConvLayer + * when groups equals inputChannels in ExpandConvLayer. + * + * \param[in] inputData input data. 
+ * \param[in] filterData the Paramters of the depthwise conv layer.. + * \param[in] batchSize batch size of input data. + * \param[in] outputChannels channels of outputData. + * \param[in] outputHeight height of outputData. + * \param[in] outputWidth width of outputData. + * \param[in] inputChannels channels of inputData. + * \param[in] inputHeight height of inputData. + * \param[in] inputWidth width of inputData.. + * \param[in] filterMultiplier equals to outputChannels/groups_. + * \param[in] filterHeight height of filter. + * \param[in] filterWidth widht of filter. + * \param[in] strideH stride size in height direction. + * \param[in] strideW stride size in width direction. + * \param[in] paddingH padding size in height direction. + * \param[in] paddingW padding size in width direction. + * \param[out] outputData outputData. + * + */ +template +class DepthwiseConvFunctor { +public: + void operator()(const T* inputData, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData); +}; + +/** + *\brief Functor tot compute the depthwise convolution backprop w.r.t input. + * + * + * \param[in] outputGradData the grad data of output. + * \param[in] filterData the Paramters of the depthwise conv layer.. + * \param[in] batchSize batch size of input data. + * \param[in] outputChannels channels of outputData. + * \param[in] outputHeight height of outputData. + * \param[in] outputWidth width of outputData. + * \param[in] inputChannels channels of input data. + * \param[in] inputHeight height of inputData. + * \param[in] inputWidth width of inputData. + * \param[in] filterMultiplier equals to outputChannels/groups_. + * \param[in] filterHeight height of filter. + * \param[in] filterWidth widht of filter. + * \param[in] strideH stride size in height direction. + * \param[in] strideW stride size in width direction. + * \param[in] paddingH padding size in height direction. + * \param[in] paddingW padding size in width direction. + * \param[out] inputGrad the grad data of input. + * + */ +template +class DepthwiseConvGradInputFunctor { +public: + void operator()(const T* outputGrad, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad); +}; + +/** + *\brief Functor tot compute the depthwise convolution backprop w.r.t filter. + * + * \param[in] outputGradData the grad data of output. + * \param[in] inputData inputData. + * \param[in] batchSize batch size of input data. + * \param[in] outputChannels channels of outputData. + * \param[in] outputHeight height of outputData. + * \param[in] outputWidth width of outputData. + * \param[in] inputChannels channels of input data. + * \param[in] inputHeight height of inputData. + * \param[in] inputWidth width of inputData. + * \param[in] filterMultiplier equals to outputChannels/groups_. + * \param[in] filterHeight height of filter. + * \param[in] filterWidth widht of filter. + * \param[in] strideH stride size in height direction. + * \param[in] strideW stride size in width direction. + * \param[in] paddingH padding size in height direction. 
+ * \param[in] paddingW padding size in width direction. + * \param[in] colData Auxiliary data when calculating filterGrad. + * \param[in] multiplierData Auxiliary data when calculating filterGrad. + * \param[out] filterGrad the grad data of filter. + * + */ +template +class DepthwiseConvGradFilterFunctor { +public: + void operator()(const T* outputGrad, + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* filterGrad); +}; + +} // namespace paddle diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu new file mode 100644 index 0000000000..2d722dfcfc --- /dev/null +++ b/paddle/function/DepthwiseConvOpGpu.cu @@ -0,0 +1,376 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "DepthwiseConvOp.h" +#include "paddle/math/BaseMatrix.h" + +namespace paddle { + +// CUDA kernel to compute the depthwise convolution forward pass +template +__global__ void ConvolutionDepthwiseForward(const int nthreads, + const T* const inputData, + const T* const filterData, + const int batchSize, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const outputData) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if (index < nthreads) { + const int batch = index / outputChannels / outputHeight / outputWidth; + const int c_out = (index / outputHeight / outputWidth) % outputChannels; + const int h_out = (index / outputWidth) % outputHeight; + const int w_out = index % outputWidth; + + const int c_in = c_out / filterMultiplier; + const T* weight = filterData + c_out * filterHeight * filterWidth; + T value = 0; + const int h_in_start = -paddingH + h_out * strideH; + const int w_in_start = -paddingW + w_out * strideW; + const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; + const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; + if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) && + (w_in_end < inputWidth)) { + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + ++weight; + } + } + } else { + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in 
= -paddingW + w_out * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + } + ++weight; + } + } + } + outputData[index] = value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t input. +template +__global__ void ConvolutionDepthwiseInputBackward(const int nthreads, + const T* const top_diff, + const T* const weight_data, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const bottom_diff) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int batch = index / inputChannels / inputHeight / inputWidth; + const int c_in = (index / inputHeight / inputWidth) % inputChannels; + const int h_in = (index / inputWidth) % inputHeight; + const int w_in = index % inputWidth; + + const int c_out_start = c_in * filterMultiplier; + + int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH; + h_out_start = 0 > h_out_start ? 0 : h_out_start; + int h_out_end = (h_in + paddingH) / strideH; + h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end; + int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + int w_out_end = (w_in + paddingW) / strideW; + w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + paddingH - h_out * strideH; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + paddingW - w_out * strideW; + const int filter_offset = c_out * filterHeight * filterWidth + + filter_h * filterWidth + filter_w; + const int top_diff_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + value += top_diff[top_diff_offset] * weight_data[filter_offset]; + } + } + } + bottom_diff[index] += value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t filter. 
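// Both kernels above index a flat thread id over an NCHW buffer and recover
// the (sample, channel, row, column) coordinates with the same div/mod
// pattern; the forward kernel then maps an output channel to the input
// channel that feeds it with c_in = c_out / filterMultiplier, and the
// input-backward kernel inverts this with
// c_out_start = c_in * filterMultiplier. A minimal host-side sketch of that
// index arithmetic; the helper below is illustrative only, not part of the patch.
struct NCHWIndex {
  int n;  // sample
  int c;  // channel
  int h;  // row
  int w;  // column
};

inline NCHWIndex decomposeNCHW(int index, int channels, int height, int width) {
  NCHWIndex p;
  p.w = index % width;                        // fastest-varying dimension
  p.h = (index / width) % height;
  p.c = (index / width / height) % channels;
  p.n = index / width / height / channels;    // slowest-varying dimension
  return p;
}

// Example: with filterMultiplier == 2, output channels {0, 1} read input
// channel 0, {2, 3} read input channel 1, and so on:
//   NCHWIndex p = decomposeNCHW(idx, outputChannels, outputHeight, outputWidth);
//   int c_in = p.c / 2;  // filterMultiplier == 2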
+template +__global__ void ConvolutionDepthwiseFilterBackward(const int num_i, + const int nthreads, + const T* const top_diff, + const T* const inputData, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const buffer_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int h_out = (index / outputWidth) % outputHeight; + const int w_out = index % outputWidth; + const int kh = + (index / filterWidth / outputHeight / outputWidth) % filterHeight; + const int kw = (index / outputHeight / outputWidth) % filterWidth; + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int c_out = + index / (filterHeight * filterWidth * outputHeight * outputWidth); + const int c_in = c_out / filterMultiplier; + const int batch = num_i; + const int top_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + const int bottom_offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + + w_in; + buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; + } else { + buffer_data[index] = 0; + } + } +} + +template +class DepthwiseConvFunctor { +public: + void operator()(const T* inputData, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData) { + int outputSize = batchSize * outputChannels * outputHeight * outputWidth; + + size_t blocks = (outputSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + ConvolutionDepthwiseForward<<>>( + outputSize, + inputData, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + outputData); + } +}; + +template +class DepthwiseConvGradInputFunctor { +public: + void operator()(const T* outputGrad, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad) { + int inputSize = batchSize * inputChannels * inputHeight * inputWidth; + + size_t blocks = (inputSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + ConvolutionDepthwiseInputBackward + // NOLINT_NEXT_LINE(whitespace/operators) + <<>>(inputSize, + outputGrad, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + inputGrad); + } +}; + +template +class 
DepthwiseConvGradFilterFunctor { +public: + void operator()(const T* outputGrad, + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* filterGrad) { + int colDataSize = outputChannels * filterHeight * filterWidth * + outputHeight * outputWidth; + + size_t blocks = (colDataSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, + 1, + filterGrad, + false, + true); + + for (int i = 0; i < batchSize; i++) { + ConvolutionDepthwiseFilterBackward< + T><<>>(i, + colDataSize, + outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + colData); + int K = outputHeight * outputWidth; + int M = colDataSize / K; + + BaseMatrix colMatrix(M, K, colData, false, true); + filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); + } + } +}; + +#ifdef PADDLE_TYPE_DOUBLE +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvGradFilterFunctor; +#else +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvGradFilterFunctor; +#endif + +} // namespace paddle diff --git a/paddle/framework/op_desc_test.cc b/paddle/function/DepthwiseConvOpTest.cpp similarity index 51% rename from paddle/framework/op_desc_test.cc rename to paddle/function/DepthwiseConvOpTest.cpp index d0c52523b6..f44ae0c342 100644 --- a/paddle/framework/op_desc_test.cc +++ b/paddle/function/DepthwiseConvOpTest.cpp @@ -13,23 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include - -TEST(OpDesc, Create) { - paddle::framework::OpDesc op_desc; - op_desc.set_type("add"); - op_desc.add_inputs("X"); - op_desc.add_inputs("Y"); - op_desc.add_outputs("Z"); - - auto attr = op_desc.mutable_attrs()->Add(); - attr->set_type(paddle::framework::AttrType::FLOAT); - attr->set_f(3.14); - - // required field name is not set, so IsInitialized should be false. - ASSERT_FALSE(op_desc.IsInitialized()); - - attr->set_name("add"); - // after all required fields are set, IsInitialized should be true now. - ASSERT_TRUE(op_desc.IsInitialized()); -} \ No newline at end of file +#include "ConvOpTest.h" + +namespace paddle { + +#ifndef PADDLE_ONLY_CPU +TEST(DepthwiseConv, Forward) { + DepthwiseConvolution( + "GemmConv-CPU", "DepthwiseConv-GPU", forward); +} + +TEST(DepthwiseConv, BackwardInput) { + DepthwiseConvolution( + "GemmConvGradInput-CPU", "DepthwiseConvGradInput-GPU", backward_input); +} + +TEST(DepthwiseConv, BackwardFilter) { + DepthwiseConvolution( + "GemmConvGradFilter-CPU", "DepthwiseConvGradFilter-GPU", backward_filter); +} +#endif + +} // namespace paddle diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp new file mode 100644 index 0000000000..674141ed39 --- /dev/null +++ b/paddle/function/EigenGemm.cpp @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { + +template +struct EigenBlasGemm { + typedef Eigen::TensorMap, + Eigen::Aligned> + Matrix; + + static void compute(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc) { + Eigen::array sizeA; + if (transA) { + sizeA[0] = K; + sizeA[1] = M; + CHECK_EQ(M, lda); + } else { + sizeA[0] = M; + sizeA[1] = K; + CHECK_EQ(K, lda); + } + Eigen::array sizeB; + if (transB) { + sizeB[0] = N; + sizeB[1] = K; + CHECK_EQ(K, ldb); + } else { + sizeB[0] = K; + sizeB[1] = N; + CHECK_EQ(N, ldb); + } + Eigen::array sizeC; + sizeC[0] = M; + sizeC[1] = N; + CHECK_EQ(N, ldc); + + const Matrix a(const_cast(A), sizeA); + const Matrix b(const_cast(B), sizeB); + Matrix c(C, sizeC); + + typedef typename Eigen::Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 0); + dims[0].first = transA ? 0 : 1; + dims[0].second = transB ? 1 : 0; + + Eigen::DefaultDevice device; + if (alpha == T(1) && beta == T(0)) { + c.device(device) = a.contract(b, dims); + } else if (alpha == T(1) && beta == T(1)) { + c.device(device) += a.contract(b, dims); + } else { + c.device(device) = alpha * a.contract(b, dims) + beta * c; + } + } +}; + +#ifdef PADDLE_TYPE_DOUBLE +template class EigenBlasGemm; +#else +template class EigenBlasGemm; +#endif + +} // namespace paddle diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp index fdf7e631e5..7b0b1c6adb 100644 --- a/paddle/function/FunctionTest.cpp +++ b/paddle/function/FunctionTest.cpp @@ -24,14 +24,14 @@ void FunctionApi(typename Tensor::Matrix& output, template <> void FunctionApi(CpuMatrix& output, const CpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 100); - EXPECT_EQ(output.getWidth(), 200); + EXPECT_EQ(output.getHeight(), 100U); + EXPECT_EQ(output.getWidth(), 200U); } template <> void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 10); - EXPECT_EQ(output.getWidth(), 20); + EXPECT_EQ(output.getHeight(), 10U); + EXPECT_EQ(output.getWidth(), 20U); } template @@ -85,16 +85,16 @@ void testBufferArgs(const BufferArgs& inputs, } void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) { - EXPECT_EQ(inputs.size(), 1); + EXPECT_EQ(inputs.size(), 1U); check(inputs[0]); } TEST(Arguments, Matrix) { MatrixPtr matrix = Matrix::create(100, 200); CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 2); - EXPECT_EQ(arg.shape()[0], 100); - EXPECT_EQ(arg.shape()[1], 200); + EXPECT_EQ(arg.shape().ndims(), 2U); + EXPECT_EQ(arg.shape()[0], 100U); + EXPECT_EQ(arg.shape()[1], 200U); EXPECT_EQ(arg.data(), matrix->getData()); EXPECT_EQ(arg.matrix().getHeight(), matrix->getHeight()); @@ -112,8 +112,8 @@ TEST(Arguments, Matrix) { TEST(Arguments, Vector) { VectorPtr vector = Vector::create(100, false); CheckBufferArg check = 
[=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 1); - EXPECT_EQ(arg.shape()[0], 100); + EXPECT_EQ(arg.shape().ndims(), 1U); + EXPECT_EQ(arg.shape()[0], 100U); EXPECT_EQ(arg.data(), vector->getData()); CpuVector inVector = arg.vector(); @@ -131,9 +131,9 @@ TEST(Arguments, Vector) { TEST(Arguments, CpuSparseMatrix) { CpuSparseMatrix sparse(200, 300, 50); CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 2); - EXPECT_EQ(arg.shape()[0], 200); - EXPECT_EQ(arg.shape()[1], 300); + EXPECT_EQ(arg.shape().ndims(), 2U); + EXPECT_EQ(arg.shape()[0], 200U); + EXPECT_EQ(arg.shape()[1], 300U); EXPECT_EQ(arg.data(), sparse.getData()); // CHECK_EQ(arg.sparse().nnz(), 50); // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT); @@ -152,10 +152,10 @@ TEST(Arguments, CpuSparseMatrix) { TEST(Arguments, BufferArg) { BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3}); CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 3); - EXPECT_EQ(arg.shape()[0], 1); - EXPECT_EQ(arg.shape()[1], 2); - EXPECT_EQ(arg.shape()[2], 3); + EXPECT_EQ(arg.shape().ndims(), 3U); + EXPECT_EQ(arg.shape()[0], 1U); + EXPECT_EQ(arg.shape()[1], 2U); + EXPECT_EQ(arg.shape()[2], 3U); }; BufferArgs argments; diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index a40e5d9d2e..f8cf4ebea8 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -12,101 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "GemmConvOp.h" +#include "ConvOp.h" #include "GemmFunctor.h" +#include "Im2Col.h" #include "paddle/math/MemoryHandle.h" namespace paddle { -/* - * imData = [input_channels, input_height, input_width] - * colData = [input_channels, filter_height, filter_width, - * output_height, output_width] - */ -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData) { - int channelsCol = inputChannels * filterHeight * filterWidth; - - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterWidth; - int hOffset = (c / filterWidth) % filterHeight; - int c_im = c / filterWidth / filterHeight; - for (int h = 0; h < outputHeight; ++h) { - for (int w = 0; w < outputWidth; ++w) { - int imRowIdx = h * strideHeight + hOffset; - int imColIdx = w * strideWidth + wOffset; - if ((imRowIdx - paddingHeight) < 0 || - (imRowIdx - paddingHeight) >= inputHeight || - (imColIdx - paddingWidth) < 0 || - (imColIdx - paddingWidth) >= inputWidth) { - colData[(c * outputHeight + h) * outputWidth + w] = T(0); - } else { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - colData[(c * outputHeight + h) * outputWidth + w] = - imData[imRowIdx * inputWidth + imColIdx]; - } - } - } - } - } -}; - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData) { - int channelsCol = inputChannels * filterHeight * filterWidth; - - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterWidth; - 
int hOffset = (c / filterWidth) % filterHeight; - int c_im = c / filterWidth / filterHeight; - for (int h = 0; h < outputHeight; ++h) { - for (int w = 0; w < outputWidth; ++w) { - int imRowIdx = h * strideHeight + hOffset; - int imColIdx = w * strideWidth + wOffset; - if ((imRowIdx - paddingHeight) >= 0 && - (imRowIdx - paddingHeight) < inputHeight && - (imColIdx - paddingWidth) >= 0 && - (imColIdx - paddingWidth) < inputWidth) { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - imData[imRowIdx * inputWidth + imColIdx] += - colData[(c * outputHeight + h) * outputWidth + w]; - } - } - } - } - } -}; - /* * \brief Forward calculation of convolution. */ @@ -117,8 +29,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); @@ -155,51 +66,60 @@ public: real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); - real* colData = reinterpret_cast(memory_->getBuf()); - - Im2ColFunctor im2col; - GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + Im2ColFunctor im2col; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { - im2col(inputData + g * inputOffset, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, - strideH(), - strideW(), - paddingH(), - paddingW(), - outputHeight, - outputWidth, - colData); - + if (needIm2col) { + im2col(inputData + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + } else { + colData = inputData + g * inputOffset; + } int M = outputChannels / groups_; int N = outputHeight * outputWidth; int K = inputChannels / groups_ * filterHeight * filterWidth; - gemm(CblasNoTrans, - CblasNoTrans, - M, - N, - K, - 1.0f, - filterData + g * filterOffset, - K, - colData, - N, - beta, - outputData + g * outputOffset, - N); + BlasGemm::compute(false, + false, + M, + N, + K, + 1.0f, + filterData + g * filterOffset, + K, + colData, + N, + beta, + outputData + g * outputOffset, + N); } inputData += inputChannels * inputHeight * inputWidth; outputData += outputChannels * outputHeight * outputWidth; @@ -217,8 +137,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& output = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& input = 
outputs[0].shape(); @@ -249,15 +168,26 @@ public: real* outputGrad = inputs[0].data(); real* filterData = inputs[1].data(); real* inputGrad = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); - real* colData = reinterpret_cast(memory_->getBuf()); - - Col2ImFunctor col2im; - GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + Col2ImFunctor col2im; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; @@ -267,33 +197,34 @@ public: int K = outputChannels / groups_; int N = outputHeight * outputWidth; int M = inputChannels / groups_ * filterHeight * filterWidth; - gemm(CblasTrans, - CblasNoTrans, - M, - N, - K, - 1.0f, - filterData + g * filterOffset, - M, - outputGrad + g * outputOffset, - N, - 0.0f, - colData, - N); - - col2im(colData, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, - strideH(), - strideW(), - paddingH(), - paddingW(), - outputHeight, - outputWidth, - inputGrad + g * inputOffset); + real scale = 0.0f; + if (!needIm2col) { + colData = inputGrad + g * inputOffset; + scale = 1.0f; + } + BlasGemm::compute(true, + false, + M, + N, + K, + 1.0f, + filterData + g * filterOffset, + M, + outputGrad + g * outputOffset, + N, + scale, + colData, + N); + if (needIm2col) { + col2im(inputGrad + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + } } inputGrad += inputChannels * inputHeight * inputWidth; outputGrad += outputChannels * outputHeight * outputWidth; @@ -311,8 +242,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& output = inputs[0].shape(); const TensorShape& input = inputs[1].shape(); const TensorShape& filter = outputs[0].shape(); @@ -347,50 +277,59 @@ public: real* outputGrad = inputs[0].data(); real* inputData = inputs[1].data(); real* filterGrad = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); - real* colData = reinterpret_cast(memory_->getBuf()); - - Im2ColFunctor im2col; - GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + Im2ColFunctor im2col; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = 
filter.getElements() / groups_; for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { - im2col(inputData + g * inputOffset, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, - strideH(), - strideW(), - paddingH(), - paddingW(), - outputHeight, - outputWidth, - colData); - + if (needIm2col) { + im2col(inputData + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + } else { + colData = inputData + g * inputOffset; + } int M = outputChannels / groups_; int K = outputHeight * outputWidth; int N = inputChannels / groups_ * filterHeight * filterWidth; - gemm(CblasNoTrans, - CblasTrans, - M, - N, - K, - 1.0f, - outputGrad + g * outputOffset, - K, - colData, - K, - i == 0 ? beta : 1.0f, - filterGrad + g * filterOffset, - N); + BlasGemm::compute(false, + true, + M, + N, + K, + 1.0f, + outputGrad + g * outputOffset, + K, + colData, + K, + i == 0 ? beta : 1.0f, + filterGrad + g * filterOffset, + N); } inputData += inputChannels * inputHeight * inputWidth; outputGrad += outputChannels * outputHeight * outputWidth; diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h deleted file mode 100644 index 9f11cce597..0000000000 --- a/paddle/function/GemmConvOp.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ConvOp.h" - -namespace paddle { - -/* - * imData = [input_channels, input_height, input_width] - * colData = [input_channels, filter_height, filter_width, - * output_height, output_width] - */ -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData); -}; - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData); -}; - -} // namespace paddle diff --git a/paddle/function/GemmConvOpGpu.cu b/paddle/function/GemmConvOpGpu.cu deleted file mode 100644 index 2a1795ff0f..0000000000 --- a/paddle/function/GemmConvOpGpu.cu +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ConvOp.h" -#include "GemmConvOp.h" - -namespace paddle { - -template -__global__ -void im2col(const T* data_im, int numOuts, int height, int width, - int blockH, int blockW, - int strideH, int strideW, - int paddingH, int paddingW, - int height_col, int width_col, - T* data_col) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < numOuts) { - int w_out = index % width_col; - index /= width_col; - int h_out = index % height_col; - int channel_in = index / height_col; - int channel_out = channel_in * blockH * blockW; - int h_in = h_out * strideH; - int w_in = w_out * strideW; - - data_col += (channel_out * height_col + h_out) * width_col + w_out; - for (int i = 0; i < blockH; ++i) { - for (int j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { - *data_col = 0; - } else { - rIdx = rIdx + channel_in*height - paddingH; - cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; - } - data_col += height_col * width_col; - } - } - } -} - -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData) { - int numKernels = inputChannels * outputHeight * outputWidth; - int blocks = (numKernels + 1024 -1) / 1024; - int blockX = 512; - int blockY = (blocks + 512 - 1) / 512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - im2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth, - strideHeight, strideWidth, paddingHeight, paddingWidth, - outputHeight, outputWidth, colData); - CHECK_SYNC("Im2ColFunctor GPU failed"); - } -}; - -template -__global__ -void col2im(size_t n, const T* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - T* data_im) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - T val = 0; - int w = int(index % width); - int h = int((index / width) % height); - int c = int(index / (width * height)); - if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - paddingH) < (height - 2 * paddingH)) { - // compute the start and end of the output - int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); - int h_col_start = - (h < (int)blockH) ? 
0 : (h - (int)blockH) / (int)strideH + 1; - int h_col_end = min(int(h / strideH + 1), int(height_col)); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - h -= paddingH; - w -= paddingW; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] += val; - } - } -} - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData) { - size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) - * (inputWidth + 2*paddingWidth); - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - col2im<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, - colData, - inputHeight + 2*paddingHeight, - inputWidth + 2*paddingWidth, - inputChannels, - filterHeight, - filterWidth, - strideHeight, - strideWidth, - paddingHeight, - paddingWidth, - outputHeight, - outputWidth, - imData); - CHECK_SYNC("Col2ImFunctor GPU failed"); - } -}; - -template class Im2ColFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; - -} // namespace paddle diff --git a/paddle/function/GemmConvOpTest.cpp b/paddle/function/GemmConvOpTest.cpp new file mode 100644 index 0000000000..5283d79a5a --- /dev/null +++ b/paddle/function/GemmConvOpTest.cpp @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "ConvOpTest.h" + +namespace paddle { + +TEST(GemmConv, NaiveConv) { + Convolution( + "NaiveConv-CPU", "GemmConv-CPU", forward); + Convolution2( + "NaiveConv-CPU", "GemmConv-CPU", forward); +} + +#ifndef PADDLE_ONLY_CPU +TEST(GemmConv, Forward) { + Convolution( + "GemmConv-CPU", "GemmConv-GPU", forward); + Convolution2( + "GemmConv-CPU", "GemmConv-GPU", forward); +} + +TEST(GemmConv, BackwardInput) { + Convolution( + "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input); + Convolution2( + "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input); +} + +TEST(GemmConv, BackwardFilter) { + Convolution( + "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter); + Convolution2( + "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter); +} +#endif + +} // namespace paddle diff --git a/paddle/function/GemmFunctor.cpp b/paddle/function/GemmFunctor.cpp new file mode 100644 index 0000000000..9e25ee58a1 --- /dev/null +++ b/paddle/function/GemmFunctor.cpp @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "GemmFunctor.h" +#include "paddle/math/MathFunctions.h" + +namespace paddle { + +template +struct BlasGemm { + static void compute(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc) { +#ifdef PADDLE_USE_EIGEN_FOR_BLAS + EigenBlasGemm::compute( + transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +#else + gemm(transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +#endif + } +}; + +template +struct BlasGemm { + static void compute(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc) { + hl_matrix_mul((T*)A, + transA == false ? HPPL_OP_N : HPPL_OP_T, + (T*)B, + transB == false ? HPPL_OP_N : HPPL_OP_T, + C, + M, + N, + K, + alpha, + beta, + lda, + ldb, + ldc); + } +}; + +template struct BlasGemm; +template struct BlasGemm; + +} // namespace paddle diff --git a/paddle/function/GemmFunctor.h b/paddle/function/GemmFunctor.h index d5db5cf5e7..0809953b4e 100644 --- a/paddle/function/GemmFunctor.h +++ b/paddle/function/GemmFunctor.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/math/MathFunctions.h" +#include "TensorType.h" namespace paddle { @@ -24,73 +24,42 @@ namespace paddle { // of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul // interface. 
template -class GemmFunctor { -public: - void operator()(const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const T alpha, - const T* A, - const int lda, - const T* B, - const int ldb, - const T beta, - T* C, - const int ldc); +struct BlasGemm { + static void compute(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc); }; +// TODO(hedaoyuan): Since the definition of the real type in the Paddle +// conflicts with the Eigen library, so compile the Eigen code can not +// include the Paddle header file. And need an EigenBlasGemm template class +// that does not contain the DeviceType parameter. +// I will fix this problem and merge BlasGemm and EigenBlasGemm into one. template -class GemmFunctor { -public: - void operator()(const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const T alpha, - const T* A, - const int lda, - const T* B, - const int ldb, - const T beta, - T* C, - const int ldc) { - gemm(transA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); - } -}; - -template -class GemmFunctor { -public: - void operator()(const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const T alpha, - const T* A, - const int lda, - const T* B, - const int ldb, - const T beta, - T* C, - const int ldc) { - hl_matrix_mul((T*)A, - transA == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T, - (T*)B, - TransB == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T, - C, - M, - N, - K, - alpha, - beta, - lda, - ldb, - ldc); - } +struct EigenBlasGemm { + static void compute(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc); }; } // namespace paddle diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h new file mode 100644 index 0000000000..48e2e32f92 --- /dev/null +++ b/paddle/function/Im2Col.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "TensorShape.h" +#include "TensorType.h" + +namespace paddle { + +/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ +enum ColFormat { kCFO = 0, kOCF = 1 }; + +/* + * \brief Converts the image data of three dimensions(CHW) into a colData of + * five dimensions in the Im2ColFunctor calculation, + * And in the Col2ImFunctor calculation, it is reversed. + * + * \param imData Image data. + * \param imShape The shape of imData, + * [inputChannels, inputHeight, inputWidth]. + * \param colData Column data. + * \param colShape The shape of colData. 
+ * + * If the template argument Format is kCFO, the shape of colData is: + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * inputChannels * filterHeight * filterWidth, and the width is equal + * outputHeight * outputWidth. + * + * Reshape: + * shape of colData shape of convolution matrix + * [inputChannels, + * filterHeight, + * filterWidth, ======> [height, width] + * outputHeight, + * outputWidth] + * + * If the template argument Format is kOCF, the shape of colData is: + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + * So, it is easy to reshape into a sequence matrix for rnn calculation. + * The shape of sequence matrix is [seqLength, stepSize], where the seqLength + * is equal outputHeight * outputWidth, and the stepSize is equal + * inputChannels * filterHeight * filterWidth. + * + * Reshape: + * shape of colData shape of sequence matrix + * [outputHeight, + * outputWidth, + * inputChannels, ======> [seqLength, stepSize] + * filterHeight, + * filterWidth] + * + * \note The caller needs to ensure that imShape.inputChannels is equal to + * colShape.inputChannels. + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth); +}; + +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth); +}; + +} // namespace paddle diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp new file mode 100644 index 0000000000..b7d1eb1ede --- /dev/null +++ b/paddle/function/Im2ColOp.cpp @@ -0,0 +1,235 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
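Before the CPU and GPU implementations below, a toy walk-through of the kCFO layout documented above may be easier to follow than the index algebra. The sketch is not part of the patch and hard-codes a 1-channel 3x3 image, a 2x2 filter, stride 1 and no padding, so the output is 2x2 and colData has shape [1, 2, 2, 2, 2]:

```cpp
#include <cstdio>

int main() {
  const int C = 1, H = 3, W = 3, fH = 2, fW = 2;
  const int outH = H - fH + 1, outW = W - fW + 1;  // stride 1, padding 0
  float im[C * H * W] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  float col[C * fH * fW * outH * outW];

  // kCFO: col[c][i][j][h][w] = im[c][h + i][w + j]
  for (int c = 0; c < C; ++c)
    for (int i = 0; i < fH; ++i)
      for (int j = 0; j < fW; ++j)
        for (int h = 0; h < outH; ++h)
          for (int w = 0; w < outW; ++w)
            col[(((c * fH + i) * fW + j) * outH + h) * outW + w] =
                im[(c * H + h + i) * W + w + j];

  // Print the reshaped convolution matrix: one row per filter element,
  // one column per output position.
  for (int row = 0; row < C * fH * fW; ++row) {
    for (int pos = 0; pos < outH * outW; ++pos)
      std::printf("%2.0f ", col[row * outH * outW + pos]);
    std::printf("\n");
  }
  return 0;
}
```

Each printed row corresponds to one (channel, filterRow, filterCol) triple and each column to one output position, i.e. the [inputChannels * filterHeight * filterWidth, outputHeight * outputWidth] convolution matrix; the kOCF layout stores the same values with the two index groups swapped, which is why Im2ColTest can compare the two formats through a transpose.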
*/ + +#include "Im2Col.h" + +namespace paddle { + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputHeight = colShape[3]; + int outputWidth = colShape[4]; + int channelsCol = inputChannels * filterHeight * filterWidth; + + for (int c = 0; c < channelsCol; ++c) { + int wOffset = c % filterWidth; + int hOffset = (c / filterWidth) % filterHeight; + int c_im = c / filterWidth / filterHeight; + for (int h = 0; h < outputHeight; ++h) { + for (int w = 0; w < outputWidth; ++w) { + int imRowIdx = h * strideHeight + hOffset; + int imColIdx = w * strideWidth + wOffset; + if ((imRowIdx - paddingHeight) < 0 || + (imRowIdx - paddingHeight) >= inputHeight || + (imColIdx - paddingWidth) < 0 || + (imColIdx - paddingWidth) >= inputWidth) { + colData[(c * outputHeight + h) * outputWidth + w] = T(0); + } else { + imRowIdx += c_im * inputHeight - paddingHeight; + imColIdx -= paddingWidth; + colData[(c * outputHeight + h) * outputWidth + w] = + imData[imRowIdx * inputWidth + imColIdx]; + } + } + } + } + } +}; + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + */ +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputHeight = colShape[3]; + int outputWidth = colShape[4]; + int channelsCol = inputChannels * filterHeight * filterWidth; + + for (int c = 0; c < channelsCol; ++c) { + int wOffset = c % filterWidth; + int hOffset = (c / filterWidth) % filterHeight; + int c_im = c / filterWidth / filterHeight; + for (int h = 0; h < outputHeight; ++h) { + for (int w = 0; w < outputWidth; ++w) { + int imRowIdx = h * strideHeight + hOffset; + int imColIdx = w * strideWidth + wOffset; + if ((imRowIdx - paddingHeight) >= 0 && + (imRowIdx - paddingHeight) < inputHeight && + (imColIdx - paddingWidth) >= 0 && + (imColIdx - paddingWidth) < inputWidth) { + imRowIdx += c_im * inputHeight - paddingHeight; + imColIdx -= paddingWidth; + imData[imRowIdx * inputWidth + imColIdx] += + colData[(c * outputHeight + h) * outputWidth + w]; + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = 
colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + for (int outputH = 0; outputH < outputHeight; ++outputH) { + for (int outputW = 0; outputW < outputWidth; ++outputW) { + for (int channel = 0; channel < inputChannels; ++channel) { + for (int filterH = 0; filterH < filterHeight; ++filterH) { + for (int filterW = 0; filterW < filterWidth; ++filterW) { + int imRowOffset = + outputH * strideHeight + filterH - paddingHeight; + int imColOffset = outputW * strideWidth + filterW - paddingWidth; + int colDataOffset = + (((outputH * outputWidth + outputW) * inputChannels + + channel) * + filterHeight + + filterH) * + filterWidth + + filterW; + if (imRowOffset < 0 || imRowOffset >= inputHeight || + imColOffset < 0 || imColOffset >= inputWidth) { + colData[colDataOffset] = float(0); + } else { + int imDataOffset = + (channel * inputHeight + imRowOffset) * inputWidth + + imColOffset; + colData[colDataOffset] = imData[imDataOffset]; + } + } + } + } + } + } + } +}; + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + for (int outputH = 0; outputH < outputHeight; ++outputH) { + for (int outputW = 0; outputW < outputWidth; ++outputW) { + for (int channel = 0; channel < inputChannels; ++channel) { + for (int filterH = 0; filterH < filterHeight; ++filterH) { + for (int filterW = 0; filterW < filterWidth; ++filterW) { + int imRowOffset = + outputH * strideHeight + filterH - paddingHeight; + int imColOffset = outputW * strideWidth + filterW - paddingWidth; + int colDataOffset = + (((outputH * outputWidth + outputW) * inputChannels + + channel) * + filterHeight + + filterH) * + filterWidth + + filterW; + if (imRowOffset >= 0 && imRowOffset < inputHeight && + imColOffset >= 0 && imColOffset < inputWidth) { + int imDataOffset = + (channel * inputHeight + imRowOffset) * inputWidth + + imColOffset; + imData[imDataOffset] += colData[colDataOffset]; + } + } + } + } + } + } + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace paddle diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu new file mode 100644 index 0000000000..bd98610498 --- /dev/null +++ b/paddle/function/Im2ColOpGpu.cu @@ -0,0 +1,425 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
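A small aside that applies to both the CPU functors above and the GPU functors that follow: the functors only read outputHeight/outputWidth from colShape, and callers derive those extents from the input size, filter size, stride and padding with the same expression Im2ColTest uses further below. A sketch with purely illustrative numbers:

```cpp
#include <cstdio>

// Same arithmetic as `(inputSize - filterSize + 2 * padding + stride) / stride`
// in Im2ColTest; the helper name is illustrative only.
int convOutputSize(int inputSize, int filterSize, int stride, int padding) {
  return (inputSize - filterSize + 2 * padding + stride) / stride;
}

int main() {
  // A 100x96 input with a 5x7 filter, stride 2 and padding 1 gives a 49x46 output.
  std::printf("outputHeight=%d outputWidth=%d\n",
              convOutputSize(100, 5, 2, 1),   // 49
              convOutputSize(96, 7, 2, 1));   // 46
  return 0;
}
```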
*/ + +#include "Im2Col.h" +#include "hl_device_functions.cuh" + +namespace paddle { + +template +__global__ void im2col(const T* data_im, + int numOuts, + int height, + int width, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int height_col, + int width_col, + T* data_col) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < numOuts) { + int w_out = index % width_col; + index /= width_col; + int h_out = index % height_col; + int channel_in = index / height_col; + int channel_out = channel_in * blockH * blockW; + int h_in = h_out * strideH; + int w_in = w_out * strideW; + + data_col += (channel_out * height_col + h_out) * width_col + w_out; + for (int i = 0; i < blockH; ++i) { + for (int j = 0; j < blockW; ++j) { + int rIdx = int(h_in + i); + int cIdx = int(w_in + j); + if ((rIdx - (int)paddingH) >= (int)height || + (rIdx - (int)paddingH) < 0 || + (cIdx - (int)paddingW) >= (int)width || + (cIdx - (int)paddingW) < 0) { + *data_col = 0; + } else { + rIdx = rIdx + channel_in * height - paddingH; + cIdx = cIdx - paddingW; + *data_col = data_im[rIdx * width + cIdx]; + } + data_col += height_col * width_col; + } + } + } +} + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputHeight = colShape[3]; + int outputWidth = colShape[4]; + + int numKernels = inputChannels * outputHeight * outputWidth; + int blocks = (numKernels + 1024 - 1) / 1024; + int blockX = 512; + int blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + im2col<<>>(imData, + numKernels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth, + colData); + CHECK_SYNC("Im2ColFunctor GPU failed"); + } +}; + +template +__global__ void col2im(size_t n, + const T* data_col, + size_t height, + size_t width, + size_t channels, + size_t blockH, + size_t blockW, + size_t strideH, + size_t strideW, + size_t paddingH, + size_t paddingW, + size_t height_col, + size_t width_col, + T* data_im) { + size_t index = + (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < n) { + T val = 0; + int w = int(index % width); + int h = int((index / width) % height); + int c = int(index / (width * height)); + if ((w - (int)paddingW) >= 0 && + (w - (int)paddingW) < (width - 2 * paddingW) && + (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) { + // compute the start and end of the output + int w_col_start = + (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; + int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col)); + int h_col_start = + (h < (int)blockH) ? 
0 : (h - (int)blockH) / (int)strideH + 1; + int h_col_end = min(int(h / strideH + 1), int(height_col)); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + // the col location: [c * width * height + h_out, w_out] + int c_col = int(c * blockH * blockW) + + (h - h_col * (int)strideH) * (int)blockW + + (w - w_col * (int)strideW); + val += data_col[(c_col * height_col + h_col) * width_col + w_col]; + } + } + h -= paddingH; + w -= paddingW; + data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) + + h * (width - 2 * paddingW) + w] += val; + } + } +} + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + */ +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[1]; + int filterWidth = colShape[2]; + int outputHeight = colShape[3]; + int outputWidth = colShape[4]; + + size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) * + (inputWidth + 2 * paddingWidth); + + size_t blocks = (numKernels + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + col2im<<>>( + numKernels, + colData, + inputHeight + 2 * paddingHeight, + inputWidth + 2 * paddingWidth, + inputChannels, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth, + imData); + CHECK_SYNC("Col2ImFunctor GPU failed"); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +template +__global__ void im2colOCF(const T* imData, + T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth) { + int swId = blockIdx.x; + int shId = blockIdx.y; + for (int channelId = threadIdx.z; channelId < inputChannels; + channelId += blockDim.z) { + for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { + int widthOffset = idx + swId * strideWidth - paddingWidth; + int heightOffset = idy + shId * strideHeight - paddingHeight; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; + + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) * + (inputChannels * filterHeight * filterWidth); + + if (heightOffset >= inputHeight || heightOffset < 0 || + widthOffset >= inputWidth || widthOffset < 0) { + colData[colOffset] = T(0); + } else { + colData[colOffset] = imData[imOffset]; + } + } + } + } +} + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& 
imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + + int blockDimX = 0; + int blockDimY = 0; + if (filterHeight <= 4 && filterWidth <= 4) { + blockDimX = 4; + blockDimY = 4; + } else if (filterHeight <= 8 && filterWidth <= 8) { + blockDimX = 8; + blockDimY = 8; + } else if (filterHeight <= 16 && filterWidth <= 16) { + blockDimX = 16; + blockDimY = 16; + } else { + blockDimX = 32; + blockDimY = 32; + } + + int blockDimZ = 1024 / blockDimX / blockDimY; + dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); + dim3 grid(outputWidth, outputHeight); + im2colOCF<<>>(imData, + colData, + inputChannels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, + strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth); + CHECK_SYNC("Im2ColFunctor GPU failed"); + } +}; + +template +__global__ void col2imOCF(T* imData, + const T* colData, + int inputChannels, + int inputHeight, + int inputWidth, + int filterHeight, + int filterWidth, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth, + int outputHeight, + int outputWidth) { + int swId = blockIdx.x; + int shId = blockIdx.y; + for (int channelId = threadIdx.z; channelId < inputChannels; + channelId += blockDim.z) { + for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { + for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { + int widthOffset = idx + swId * strideWidth - paddingWidth; + int heightOffset = idy + shId * strideHeight - paddingHeight; + int imOffset = widthOffset + heightOffset * inputWidth + + channelId * inputHeight * inputWidth; + + int colOffset = idx + idy * filterWidth + + channelId * filterHeight * filterWidth + + (shId * outputWidth + swId) * + (inputChannels * filterHeight * filterWidth); + + if (heightOffset >= 0 && heightOffset < inputHeight && + widthOffset >= 0 && widthOffset < inputWidth) { + paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]); + } + } + } + } +} + +/* + * imShape = [inputChannels, inputHeight, inputWidth] + * colShape = + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + */ +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth) { + int inputChannels = imShape[0]; + int inputHeight = imShape[1]; + int inputWidth = imShape[2]; + int filterHeight = colShape[3]; + int filterWidth = colShape[4]; + int outputHeight = colShape[0]; + int outputWidth = colShape[1]; + + int blockDimX = 0; + int blockDimY = 0; + if (filterHeight <= 4 && filterWidth <= 4) { + blockDimX = 4; + blockDimY = 4; + } else if (filterHeight <= 8 && filterWidth <= 8) { + blockDimX = 8; + blockDimY = 8; + } else if (filterHeight <= 16 && filterWidth <= 16) { + blockDimX = 16; + blockDimY = 16; + } else { + blockDimX = 32; + blockDimY = 32; + } + + int blockDimZ = 1024 / blockDimX / blockDimY; + dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels)); + dim3 grid(outputWidth, outputHeight); + col2imOCF<<>>(imData, + colData, + inputChannels, + inputHeight, + inputWidth, + filterHeight, + filterWidth, 
+ strideHeight, + strideWidth, + paddingHeight, + paddingWidth, + outputHeight, + outputWidth); + CHECK_SYNC("Col2ImFunctor GPU failed"); + } +}; + +template class Im2ColFunctor; +template class Im2ColFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; + +} // namespace paddle diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp new file mode 100644 index 0000000000..acc88a553a --- /dev/null +++ b/paddle/function/Im2ColTest.cpp @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Im2Col.h" +#include +#include "Function.h" +#include "paddle/math/Matrix.h" +#include "paddle/math/tests/TensorCheck.h" + +namespace paddle { + +template +void TestIm2ColFunctor() { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t filterHeight : {1, 5}) { + for (size_t filterWidth : {3, 7}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + if (inputHeight <= filterHeight || inputWidth <= filterWidth) + break; + if (padding >= filterHeight || padding >= filterWidth) break; + size_t outputHeight = + (inputHeight - filterHeight + 2 * padding + stride) / + stride; + size_t outputWidth = + (inputWidth - filterWidth + 2 * padding + stride) / stride; + + TensorShape imShape = + TensorShape({channels, inputHeight, inputWidth}); + TensorShape colShape1 = TensorShape({channels, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + TensorShape colShape2 = TensorShape({outputHeight, + outputWidth, + channels, + filterHeight, + filterWidth}); + + size_t height = channels * filterHeight * filterWidth; + size_t width = outputHeight * outputWidth; + VectorPtr input1 = Vector::create(imShape.getElements(), false); + VectorPtr input2 = Vector::create(imShape.getElements(), false); + MatrixPtr output1 = Matrix::create(height, width, false, false); + MatrixPtr output2 = Matrix::create(width, height, false, false); + input1->uniform(0.001, 1); + input2->copyFrom(*input1); + + Im2ColFunctor im2Col1; + Im2ColFunctor im2Col2; + im2Col1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding); + im2Col2(input2->getData(), + imShape, + output2->getData(), + colShape2, + stride, + stride, + padding, + padding); + + // The transposition of the result of ColFormat == kCFO + // is equal to the result of ColFormat == kOCF. 
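The comment above is the invariant this test leans on. As a reading aid, a self-contained sketch (reusing the toy shapes from the earlier kCFO example; nothing here comes from the patch) spells it out index by index:

```cpp
#include <cassert>
#include <cstdio>

int main() {
  const int C = 1, H = 3, W = 3, fH = 2, fW = 2;
  const int outH = H - fH + 1, outW = W - fW + 1;
  const int filterDim = C * fH * fW, outDim = outH * outW;
  float im[C * H * W] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  float cfo[filterDim * outDim], ocf[outDim * filterDim];

  for (int c = 0; c < C; ++c)
    for (int i = 0; i < fH; ++i)
      for (int j = 0; j < fW; ++j)
        for (int h = 0; h < outH; ++h)
          for (int w = 0; w < outW; ++w) {
            float v = im[(c * H + h + i) * W + w + j];
            // kCFO: [C, fH, fW, outH, outW] -> row = filter element, col = position
            cfo[(((c * fH + i) * fW + j) * outH + h) * outW + w] = v;
            // kOCF: [outH, outW, C, fH, fW] -> row = position, col = filter element
            ocf[(((h * outW + w) * C + c) * fH + i) * fW + j] = v;
          }

  // The kOCF matrix is exactly the transpose of the kCFO matrix.
  for (int r = 0; r < filterDim; ++r)
    for (int p = 0; p < outDim; ++p)
      assert(cfo[r * outDim + p] == ocf[p * filterDim + r]);
  std::printf("kOCF == transpose(kCFO) for the toy case\n");
  return 0;
}
```

The actual test performs the same comparison below through Matrix::transpose and TensorCheckErr.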
+ MatrixPtr test; + output2->transpose(test, true); + autotest::TensorCheckErr(*output1, *test); + + Col2ImFunctor col2Im1; + Col2ImFunctor col2Im2; + col2Im1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding); + col2Im2(input2->getData(), + imShape, + output2->getData(), + colShape2, + stride, + stride, + padding, + padding); + + autotest::TensorCheckErr(*input1, *input2); + } + } + } + } + } + } + } +} + +TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor(); } + +#ifndef PADDLE_ONLY_CPU + +TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor(); } + +#endif + +} // namespace paddle diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu index dcfcb2325d..9449b89056 100644 --- a/paddle/function/MulOpGpu.cu +++ b/paddle/function/MulOpGpu.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "MulOp.h" +#include "hl_base.h" #include "paddle/math/Matrix.h" #include "paddle/math/SparseMatrix.h" diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp index 4348f0f775..e0692fa06d 100644 --- a/paddle/function/NaiveConvOp.cpp +++ b/paddle/function/NaiveConvOp.cpp @@ -90,8 +90,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu index 9094f15284..5b6f4e6832 100644 --- a/paddle/function/PadOpGpu.cu +++ b/paddle/function/PadOpGpu.cu @@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "hl_base.h" #include "PadOp.h" +#include "hl_base.h" namespace paddle { -__global__ void KePad(real* outputs, const real* inputs, - int inC, int inH, int inW, - int padc, int padh, int padw, - int outC, int outH, int outW, int nthreads) { +__global__ void KePad(real* outputs, + const real* inputs, + int inC, + int inH, + int inW, + int padc, + int padh, + int padw, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -50,16 +58,33 @@ void Pad(real* outputs, int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; - KePad<<>> - (outputs, inputs, inC, inH, inW, cstart, hstart, wstart, - outC, outH, outW, nth); + KePad<<>>(outputs, + inputs, + inC, + inH, + inW, + cstart, + hstart, + wstart, + outC, + outH, + outW, + nth); CHECK_SYNC("Pad"); } -__global__ void KePadDiff(real* inGrad, const real* outGrad, - int inC, int inH, int inW, - int padc, int padh, int padw, - int outC, int outH, int outW, int nthreads) { +__global__ void KePadDiff(real* inGrad, + const real* outGrad, + int inC, + int inH, + int inW, + int padc, + int padh, + int padw, + int outC, + int outH, + int outW, + int nthreads) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nthreads) { const int w = idx % inW; @@ -89,9 +114,18 @@ void PadGrad(real* inGrad, int outC = inC + cstart + cend; int outH = inH + hstart + hend; int outW = inW + wstart + wend; - KePadDiff <<>> - (inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart, - outC, outH, outW, nth); + KePadDiff<<>>(inGrad, + outGrad, + inC, + inH, + inW, + cstart, + hstart, + wstart, + outC, + outH, + outW, + nth); CHECK_SYNC("PadGrad"); } diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu index c0b947e224..b0cbd9fd1d 100644 --- a/paddle/function/RowConvOpGpu.cu +++ b/paddle/function/RowConvOpGpu.cu @@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_base.h" #include "RowConvOp.h" +#include "hl_base.h" namespace paddle { -template -__global__ void KeRowConv(real* y, const real* x, const real* w, - const int* starts, const int height, const int width, - const int numSeq, const int context) { - +template +__global__ void KeRowConv(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -30,9 +34,9 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, __shared__ real sw[BLOCK_H][BLOCK_W]; for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; + sw[i][tidx] = gidx + tidx < width ? 
w[i * width + gidx + tidx] : 0.0; } - + __syncthreads(); for (int i = 0; i < numSeq; ++i) { @@ -56,9 +60,14 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, } } -__global__ void KeRowConv2(real* y, const real* x, const real* w, - const int* starts, const int height, const int width, - const int numSeq, const int context) { +__global__ void KeRowConv2(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -84,8 +93,6 @@ __global__ void KeRowConv2(real* y, const real* x, const real* w, } } - - template <> void RowConv(GpuMatrix& out, const GpuMatrix& in, @@ -105,21 +112,24 @@ void RowConv(GpuMatrix& out, dim3 dimGrid(DIVUP(width, dimBlock.x), 1); if (contextLength <= 32) { - KeRowConv<32, 32><<>> - (y, x, w, starts, height, width, numSeq, contextLength); + KeRowConv<32, 32><<>>( + y, x, w, starts, height, width, numSeq, contextLength); } else { - KeRowConv2<<>> - (y, x, w, starts, height, width, numSeq, contextLength); + KeRowConv2<<>>( + y, x, w, starts, height, width, numSeq, contextLength); } CHECK_SYNC("RowConv"); } - -template -__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwWeight(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -138,18 +148,21 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, const int start = starts[i]; const int end = starts[i + 1]; const int steps = end - start; - const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; for (int j = tidy; j < size; j += BLOCK_H) { int xoff = gidx + tidx; int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; - sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = + (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; __syncthreads(); if (tidy < (context - 1)) { yoff = yoff - context + 1; - sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy] = + (xoff < width && yoff >= start) ? 
dy[yoff * width + xoff] : 0.0; } __syncthreads(); @@ -176,11 +189,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, } } -template -__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwWeight2(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int gidx = blockIdx.x * blockDim.x; @@ -193,17 +210,21 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, const int end = starts[i + 1]; const int steps = end - start; - const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; for (int j = tidy; j < size; j += BLOCK_H) { int xoff = gidx + tidx; int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; __syncthreads(); for (int t = 0; t < context; t++) { - sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; + sh_dy[tidx][tidy] = + (xoff < width && (yoff - t) >= start && yoff - t < end) + ? dy[(yoff - t) * width + xoff] + : 0.0; __syncthreads(); real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; @@ -217,18 +238,22 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, __syncthreads(); if (tidx == 0 && (gidx + tidy) < width) { - dw[t*width + gidx + tidy] += val; + dw[t * width + gidx + tidy] += val; } } } } } -template -__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +template +__global__ void KeRowConvBwData(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -237,9 +262,9 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, __shared__ real sw[BLOCK_H][BLOCK_W]; for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; + sw[i][tidx] = gidx + tidx < width ? 
w[i * width + gidx + tidx] : 0.0; } - + __syncthreads(); for (int i = 0; i < numSeq; ++i) { @@ -261,10 +286,14 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, } } -__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, - const int* starts, const int height, const int width, const int numSeq, - const int context) { - +__global__ void KeRowConvBwData2(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { const int tidx = threadIdx.x; const int tidy = threadIdx.y; const int blky = blockDim.y; @@ -290,14 +319,13 @@ __global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, } } - template <> void RowConvGrad(const GpuMatrix& outG, - const GpuMatrix& in, - const GpuMatrix& filter, - GpuMatrix& inG, - GpuMatrix& filterG, - const GpuIVector& seq) { + const GpuMatrix& in, + const GpuMatrix& filter, + GpuMatrix& inG, + GpuMatrix& filterG, + const GpuIVector& seq) { const size_t numSeq = seq.getSize() - 1; const size_t contextLength = filter.getHeight(); const size_t height = in.getHeight(); @@ -312,14 +340,12 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimBlock(32, 32); dim3 dimGrid(DIVUP(width, dimBlock.x), 1); real* dw = filterG.getData(); - if (contextLength <= 32) { - KeRowConvBwWeight<32, 32, 32> - <<>> - (dw, x, dy, starts, height, width, numSeq, contextLength); + if (contextLength <= 32) { + KeRowConvBwWeight<32, 32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); } else { - KeRowConvBwWeight2<32, 32> - <<>> - (dw, x, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwWeight2<32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); } } @@ -328,13 +354,11 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimBlock2(32, 32); dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1); if (contextLength <= 64) { - KeRowConvBwData<32, 64> - <<>> - (dx, w, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwData<32, 64><<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); } else { - KeRowConvBwData2 - <<>> - (dx, w, dy, starts, height, width, numSeq, contextLength); + KeRowConvBwData2<<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); } } diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/function/TensorShapeTest.cpp index 45a2e106e7..e55d516d4a 100644 --- a/paddle/function/TensorShapeTest.cpp +++ b/paddle/function/TensorShapeTest.cpp @@ -19,35 +19,35 @@ namespace paddle { TEST(TensorShape, Constructor) { TensorShape t1; - EXPECT_EQ(t1.ndims(), 0); - EXPECT_EQ(t1.getElements(), 0); + EXPECT_EQ(t1.ndims(), 0U); + EXPECT_EQ(t1.getElements(), 0U); TensorShape t2(3); - EXPECT_EQ(t2.ndims(), 3); - EXPECT_EQ(t2.getElements(), 1); + EXPECT_EQ(t2.ndims(), 3U); + EXPECT_EQ(t2.getElements(), 1U); TensorShape t3({8, 10}); - EXPECT_EQ(t3.ndims(), 2); - EXPECT_EQ(t3.getElements(), 80); + EXPECT_EQ(t3.ndims(), 2U); + EXPECT_EQ(t3.getElements(), 80U); TensorShape t4(t3); EXPECT_EQ(t4.ndims(), t3.ndims()); EXPECT_EQ(t4.getElements(), t3.getElements()); TensorShape t5({1, 2, 3, 4, 5}); - EXPECT_EQ(t5.ndims(), 5); - EXPECT_EQ(t5.getElements(), 120); + EXPECT_EQ(t5.ndims(), 5U); + EXPECT_EQ(t5.getElements(), 120U); } TEST(TensorShape, GetAndSet) { TensorShape t({1, 2, 3}); - EXPECT_EQ(t.ndims(), 3); - EXPECT_EQ(t.getElements(), 6); + EXPECT_EQ(t.ndims(), 3U); + EXPECT_EQ(t.getElements(), 6U); - EXPECT_EQ(t[1], 2); + EXPECT_EQ(t[1], 2U); t.setDim(1, 100); - EXPECT_EQ(t.getElements(), 
300); - EXPECT_EQ(t[1], 100); + EXPECT_EQ(t.getElements(), 300U); + EXPECT_EQ(t[1], 100U); } } // namespace paddle diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp index e50e46f3e9..d1c559a91e 100644 --- a/paddle/function/TensorTypeTest.cpp +++ b/paddle/function/TensorTypeTest.cpp @@ -19,9 +19,9 @@ namespace paddle { TEST(TensorType, Matrix) { Tensor::Matrix matrix(100, 200); - EXPECT_EQ(matrix.getHeight(), 100); - EXPECT_EQ(matrix.getWidth(), 200); - EXPECT_EQ(matrix.getElementCnt(), 100 * 200); + EXPECT_EQ(matrix.getHeight(), 100U); + EXPECT_EQ(matrix.getWidth(), 200U); + EXPECT_EQ(matrix.getElementCnt(), 100U * 200U); EXPECT_EQ(matrix.useGpu(), false); Tensor::Matrix testGpu(100, 200); @@ -33,15 +33,15 @@ TEST(TensorType, Vector) { Tensor::Vector gpuVector(100); EXPECT_EQ(cpuVector.useGpu(), false); EXPECT_EQ(gpuVector.useGpu(), true); - EXPECT_EQ(cpuVector.getSize(), 100); - EXPECT_EQ(gpuVector.getSize(), 100); + EXPECT_EQ(cpuVector.getSize(), 100U); + EXPECT_EQ(gpuVector.getSize(), 100U); Tensor::Vector cpuIVector(100); Tensor::Vector gpuIVector(100); EXPECT_EQ(cpuIVector.useGpu(), false); EXPECT_EQ(gpuIVector.useGpu(), true); - EXPECT_EQ(cpuIVector.getSize(), 100); - EXPECT_EQ(gpuIVector.getSize(), 100); + EXPECT_EQ(cpuIVector.getSize(), 100U); + EXPECT_EQ(gpuIVector.getSize(), 100U); } TEST(TensorType, EmptyMatrix) { diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp index e8080c3d71..6ccc487cf1 100644 --- a/paddle/function/nnpack/NNPACKConvOp.cpp +++ b/paddle/function/nnpack/NNPACKConvOp.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/function/ConvOp.h" DEFINE_bool(nnpack_allocate_outside, - false, + true, "Allocate and free workspace memory outside the NNPACK interface."); DEFINE_int32(nnpack_num_threads, 0, @@ -49,34 +49,23 @@ class NNPACKConvFunction : public ConvFunctionBase { public: void init(const FuncConfig& config) override { ConvFunctionBase::init(config); - CHECK_EQ(groups_, (size_t)1); algorithm_ = get_nnp_convolution_algorithm(config.get("algo")); - // algorithm_ = nnp_convolution_algorithm_auto; transform_strategy_ = nnp_convolution_transform_strategy_compute; nnp_status status = nnp_initialize(); CHECK_EQ(status, nnp_status_success); workspaceBuffer_ = nullptr; workspaceSize_ = 0; - threadpool_ = nullptr; - if (FLAGS_nnpack_num_threads) { - threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); - VLOG(3) << "Number of threads " - << pthreadpool_get_threads_count(threadpool_); - } + create_nnpack_threadpool(); } ~NNPACKConvFunction() { - if (threadpool_) { - pthreadpool_destroy(threadpool_); - } if (workspaceBuffer_) { free(workspaceBuffer_); } } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); @@ -99,8 +88,8 @@ public: size_t filterHeight = getFilterHeight(filter); size_t filterWidth = getFilterWidth(filter); size_t outputChannels = output[1]; - // size_t outputHeight = output[2]; - // size_t outputWidth = output[3]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; nnp_size inputSize = {.width = inputWidth, .height = inputHeight}; nnp_padding padding = {.top = (size_t)paddingH(), @@ -179,31 +168,40 @@ public: } } + size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth; + 
size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + if (batchSize == 1) { - nnp_status status = - nnp_convolution_inference(algorithm_, - transform_strategy_, - inputChannels, - outputChannels, - inputSize, - padding, - kernelSize, - outputSubsampling, - inputData, - filterData, - nullptr, /* bias */ - outputData, - bufferPtr, - sizePtr, - nnp_activation_identity, - nullptr, - threadpool_, /* threadpool */ - nullptr); - CHECK_EQ(status, nnp_status_success); + for (size_t g = 0; g < groups_; g++) { + nnp_status status = + nnp_convolution_inference(algorithm_, + transform_strategy_, + inputChannels / groups_, + outputChannels / groups_, + inputSize, + padding, + kernelSize, + outputSubsampling, + inputData + inputOffset * g, + filterData + filterOffset * g, + nullptr, /* bias */ + outputData + outputOffset * g, + bufferPtr, + sizePtr, + nnp_activation_identity, + nullptr, + threadpool_, /* threadpool */ + nullptr); + CHECK_EQ(status, nnp_status_success); + } } else { // only supports stride = 1 CHECK_EQ(strideH(), 1); CHECK_EQ(strideW(), 1); + + // TODO(hedaoyuan): There has some bug when batchSize > 1 and groups_ > 1. + CHECK_EQ(groups_, static_cast(1)); nnp_status status = nnp_convolution_output(algorithm_, batchSize, inputChannels, @@ -225,14 +223,25 @@ public: } } + static void create_nnpack_threadpool() { + if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) { + threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); + VLOG(3) << "Number of threads " + << pthreadpool_get_threads_count(threadpool_); + } + } + private: nnp_convolution_algorithm algorithm_; nnp_convolution_transform_strategy transform_strategy_; void* workspaceBuffer_; size_t workspaceSize_; - pthreadpool_t threadpool_; + static pthreadpool_t threadpool_; }; +template +pthreadpool_t NNPACKConvFunction::threadpool_ = nullptr; + REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction); } // namespace paddle diff --git a/paddle/function/nnpack/NNPACKConvOpTest.cpp b/paddle/function/nnpack/NNPACKConvOpTest.cpp index 4818011211..4dd3982487 100644 --- a/paddle/function/nnpack/NNPACKConvOpTest.cpp +++ b/paddle/function/nnpack/NNPACKConvOpTest.cpp @@ -13,87 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. 
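For the grouped path added above: each group consumes inputChannels/groups_ input planes and produces outputChannels/groups_ output planes, so the data pointers are advanced by fixed per-group offsets before each nnp_convolution_inference call. A standalone sketch of that arithmetic; the filter layout [outputChannels, inputChannels/groups, filterHeight, filterWidth] is an assumption about ConvFunctionBase, not something stated in this hunk, and all numbers are made up:

```cpp
#include <cstdio>

int main() {
  const int groups = 2;
  const int inputChannels = 4, outputChannels = 6;
  const int inputHeight = 8, inputWidth = 8;
  const int outputHeight = 8, outputWidth = 8;
  const int filterHeight = 3, filterWidth = 3;

  // Assumed filter layout: [outputChannels, inputChannels / groups, fH, fW].
  const int filterElements =
      outputChannels * (inputChannels / groups) * filterHeight * filterWidth;

  const int inputOffset = inputChannels / groups * inputHeight * inputWidth;
  const int outputOffset = outputChannels / groups * outputHeight * outputWidth;
  const int filterOffset = filterElements / groups;

  // One inference call per group, each seeing only its own slice of the data.
  for (int g = 0; g < groups; ++g) {
    std::printf("group %d: input+%d, filter+%d, output+%d\n",
                g, g * inputOffset, g * filterOffset, g * outputOffset);
  }
  return 0;
}
```

The batched path keeps CHECK_EQ(groups_, 1) because, as the TODO in the hunk notes, nnp_convolution_output with batchSize > 1 and groups_ > 1 still misbehaves.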
*/ #include -#include "paddle/function/Function.h" -#include "paddle/function/FunctionTest.h" - -DEFINE_string(algo, - "auto", - "The algorithm (auto, ft8x8, ft16x16, wt8x8, " - "implicit-gemm, or direct) for computing convolution of NNPACK."); +#include "paddle/function/ConvOpTest.h" namespace paddle { -#define IS_NNPACK_SUPPORT(algo, filterSize, stride) \ - if (algo == "direct" && filterSize != 1) continue; \ - if (algo == "direct" && batchSize != 1) continue; \ - if (algo == "wt8x8" && filterSize != 3) continue; \ - if (algo == "implicit-gemm" && batchSize != 1) continue; \ - if (algo != "auto" && algo != "implicit-gemm" && stride > 1) continue; - -class ConvolutionTest { -public: - ConvolutionTest(const std::string& conv1, - const std::string& conv2, - std::string algo = "auto") { - for (size_t batchSize : {1, 32}) { - for (size_t inputSize : {7, 14, 54}) { - for (size_t filterSize : {1, 3, 5}) { - for (size_t inputChannels : {3, 64}) { - for (size_t outputChannels : {3, 64, 128}) { - if (inputChannels < outputChannels) break; - for (size_t stride : {1, 2}) { - // if batchSize > 1 NNPACKConv only supports stride = 1 - if (batchSize > 1 && stride > 1) break; - for (size_t padding : {0, 1}) { - if (padding >= filterSize) break; - size_t outputSize = - (inputSize - filterSize + 2 * padding + stride) / stride; - IS_NNPACK_SUPPORT(algo, filterSize, stride); - LOG(INFO) << " batchSize=" << batchSize - << " inputChannels=" << inputChannels - << " inputHeight=" << inputSize - << " inputWidth=" << inputSize - << " outputChannels=" << outputChannels - << " filterHeight=" << filterSize - << " filterWidth=" << filterSize - << " outputHeight=" << outputSize - << " outputWidth=" << outputSize - << " stride=" << stride << " padding=" << padding; - - std::vector paddings = {padding, padding}; - std::vector strides = {stride, stride}; - Compare2Function test( - conv1, - conv2, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)1) - .set("algo", algo)); - - TensorShape shape0{ - batchSize, inputChannels, inputSize, inputSize}; - TensorShape shape1{ - outputChannels, inputChannels, filterSize, filterSize}; - TensorShape shape2{ - batchSize, outputChannels, outputSize, outputSize}; - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape0)); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape1)); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape2)); - test.run(); - } - } - } - } - } - } - } - } -}; +TEST(NNPACK, Forward) { + Convolution( + "GemmConv-CPU", "NNPACKConv-CPU", forward); +} -TEST(Convolution, NNPACK) { - // NNPACK only supports stride = 1 - ConvolutionTest test("GemmConv-CPU", "NNPACKConv-CPU", FLAGS_algo); +TEST(NNPACK, Depthwise) { + DepthwiseConvolution( + "GemmConv-CPU", "NNPACKConv-CPU", forward); } } // namespace paddle diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 0012636b8f..62cff9361c 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -23,6 +23,17 @@ endmacro() filter_test(GSERVER_HEADER) filter_test(GSERVER_SOURCES) + +if(NOT WITH_MKLDNN) + file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h") + file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp") + list(REMOVE_ITEM GSERVER_HEADER ${DNN_HEADER}) + list(REMOVE_ITEM GSERVER_SOURCES ${DNN_SOURCES}) + message(STATUS "Skip compiling with MKLDNNLayers and MKLDNNActivations") +else() + message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations") +endif() + if(NOT WITH_GPU) 
list(REMOVE_ITEM GSERVER_HEADER layers/CudnnConvBaseLayer.h diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index a40530f413..78e958e06f 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -112,7 +112,6 @@ BEGIN_DEFINE_ACTIVATION(softmax) private: MatrixPtr sftMaxSum_; MatrixPtr sftMaxDot_; -MatrixPtr one_; public: Error __must_check forward(Argument& act) { @@ -138,14 +137,6 @@ Error __must_check backward(Argument& act) { 1, /* trans */ false, useGpu(act.deviceId)); - if (!one_ || one_->getWidth() != outputG->getWidth()) { - Matrix::resizeOrCreate(one_, - 1, - outputG->getWidth(), - /* trans */ false, - useGpu(act.deviceId)); - one_->one(); - } sftMaxDot_->dotMul(*outputG, *outputV); sftMaxSum_->colMerge(*sftMaxDot_); @@ -186,7 +177,10 @@ Error __must_check forward(Argument& act) { useGpu(act.deviceId)); } - auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId)); + auto starts = + act.hasSubseq() + ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId)) + : act.sequenceStartPositions->getVector(useGpu(act.deviceId)); act.value->sequenceSoftmax(*act.value, *starts); return Error(); } @@ -197,8 +191,9 @@ Error __must_check backward(Argument& act) { "Input width for each timestep of sequence softmax should be 1"); } - size_t numSequences = act.getNumSequences(); - const int* starts = act.sequenceStartPositions->getData(false); + size_t numSequences = + act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences(); + const int* starts = act.getCpuStartPositions(); for (size_t i = 0; i < numSequences; ++i) { // TODO(Dangqingqing) optimization for GPU @@ -207,8 +202,8 @@ Error __must_check backward(Argument& act) { argument_.value->setData(act.value->getData() + offset, 1UL, size); argument_.grad->setData(act.grad->getData() + offset, 1UL, size); - Error status = softmax_.backward(argument_); - if (!status) return status; + Error err = softmax_.backward(argument_); + if (!err.isOK()) return err; } return Error(); } diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 4003676217..265dbb5493 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -205,10 +205,8 @@ public: hl_destroy_event(hlEvent_); hlEvent_ = NULL; } - if (batchData_) { - delete batchData_; - batchData_ = NULL; - } + delete batchData_; + batchData_ = NULL; } void setDataBatch(DataBatch* batchData) { batchData_ = batchData; } diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 2e839f6405..26cff3e677 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -202,7 +202,7 @@ void NeuralNetwork::prefetch(const std::vector& inArgs) { auto mat = dynamic_cast( para->getMat(PARAMETER_VALUE).get()); para->clearGradient(); - mat->clearIndices(); + if (mat) mat->clearIndices(); } } } @@ -403,7 +403,7 @@ public: : layerName_(layerName) { addEvaluator(std::move(evaluator)); } - virtual void eval(const NeuralNetwork& nn) override { + void eval(const NeuralNetwork& nn) override { const LayerPtr& layer = nn.getLayer(layerName_); CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel " << nn.getName(); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp 
b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 8cc08eb208..1829f72a87 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -184,7 +184,7 @@ public: } void backward(const UpdateCallback& callback) override { - if (biases_) { + if (biases_ && biases_->getWGrad()) { backwardActivation(); biases_->getWGrad()->collectBias(*getOutputGrad(), 1); biases_->getParameterPtr()->incUpdate(callback); @@ -636,7 +636,7 @@ void lenToStarts(std::vector& starts) { } starts.back() = pos; } -} +} // namespace void RecurrentGradientMachine::calcSequenceStartPositions() { std::vector starts(commonSeqInfo_.size() + 1); diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 15e7411b5f..bdae7e623a 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -124,7 +124,7 @@ void copyElements(const IVector& srcVec, dest[index[i]] = src[i]; } } -} +} // namespace void GatherAgentLayer::forwardIds(PassType passType) { IVectorPtr realId = realLayers_[0]->getOutputLabel(); diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp index 2bafeb9215..3b1f346359 100644 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ b/paddle/gserver/layers/BlockExpandLayer.cpp @@ -37,6 +37,22 @@ bool BlockExpandLayer::init(const LayerMap& layerMap, imgSizeH_ = blockConf.img_size_y(); imgSizeW_ = blockConf.img_size_x(); + std::vector strides = {(size_t)strideH_, (size_t)strideW_}; + std::vector paddings = {(size_t)paddingH_, (size_t)paddingW_}; + std::vector blocks = {(size_t)blockH_, (size_t)blockW_}; + createFunction(forward_, + "BlockExpand", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + createFunction(backward_, + "BlockExpandGrad", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + return true; } @@ -63,48 +79,27 @@ void BlockExpandLayer::forward(PassType passType) { Layer::forward(passType); size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - size_t blockNum = getBlockNum(); size_t blockSize = blockH_ * blockW_ * channels_; resetOutput(blockNum * batchSize, blockSize); - Argument& out = getOutput(); - MatrixPtr outV = getOutputValue(); - MatrixPtr input = getPrev(0)->getOutputValue(); - Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_); + // calculate output_.value + inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); + outputShape_ = TensorShape({batchSize, blockNum, blockSize}); + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), inputShape_); + outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO); + forward_[0]->calc(inputs, outputs); + + // calculate output_.sequenceStartPositions and output_.cpuSequenceDims + Argument& out = getOutput(); ICpuGpuVector::resizeOrCreate( out.sequenceStartPositions, batchSize + 1, false); IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false); int* start = out.sequenceStartPositions->getMutableData(false); int* dims = out.cpuSequenceDims->getData(); for (size_t i = 0; i < batchSize; i++) { - outVTrans_->zeroMem(); - /* expand each block as one row */ - MatrixPtr inputTmp = - Matrix::create(input->getData() + i * input->getWidth(), - 1, - input->getWidth(), - false, - useGpu_); - outVTrans_->convExpand(*inputTmp, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - 
blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_); - MatrixPtr outVTmp = - Matrix::create(outV->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - outVTrans_->transpose(outVTmp, false); start[i] = i * blockNum; dims[2 * i] = outputH_; dims[2 * i + 1] = outputW_; @@ -113,48 +108,13 @@ void BlockExpandLayer::forward(PassType passType) { } void BlockExpandLayer::backward(const UpdateCallback& callback) { - size_t blockNum = outputH_ * outputW_; - size_t blockSize = blockH_ * blockW_ * channels_; /* Calculate the input layers error */ - MatrixPtr preGrad = inputLayers_[0]->getOutputGrad(); - if (!preGrad) { - return; - } - MatrixPtr grad = getOutputGrad(); - MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_); - size_t batchSize = preGrad->getHeight(); - - CHECK_EQ(batchSize * blockNum, grad->getHeight()); - CHECK_EQ(blockSize, grad->getWidth()); - - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr gradTmp = - Matrix::create(grad->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - gradTmp->transpose(gradTrans, false); - MatrixPtr preGradTmp = - Matrix::create(preGrad->getData() + i * preGrad->getWidth(), - 1, - preGrad->getWidth(), - false, - useGpu_); - preGradTmp->convShrink(*gradTrans, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_, - 1.0, - 1.0); + if (getInputGrad(0)) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outputShape_); + outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO); + backward_[0]->calc(inputs, outputs); } } diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h index 8f347400e6..15ce73ab8b 100644 --- a/paddle/gserver/layers/BlockExpandLayer.h +++ b/paddle/gserver/layers/BlockExpandLayer.h @@ -50,8 +50,8 @@ protected: size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_; size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_; - /// auxiliary variable, which saves the transposed output value. - MatrixPtr outVTrans_; + TensorShape inputShape_; + TensorShape outputShape_; public: explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {} diff --git a/paddle/gserver/layers/ClipLayer.cpp b/paddle/gserver/layers/ClipLayer.cpp new file mode 100644 index 0000000000..13f16c9537 --- /dev/null +++ b/paddle/gserver/layers/ClipLayer.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" + +namespace paddle { + +/** + * A layer for clipping the input value by the threshold. 
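(Aside, not part of the patch.) Written out as plain loops, the clip rule and its gradient look like the sketch below; minV and maxV stand in for min_ and max_ (p1 and p2 in the formula that follows), and the exact treatment of values equal to the bounds is left to Matrix::clipDerivative.

```cpp
#include <cstdio>

int main() {
  const double minV = -1.0, maxV = 1.0;          // stand-ins for min_ / max_
  const double in[4] = {-2.0, -0.5, 0.3, 4.0};
  const double outGrad[4] = {1.0, 1.0, 1.0, 1.0};
  double out[4], inGrad[4] = {0, 0, 0, 0};

  for (int i = 0; i < 4; ++i) {
    // forward: out = min(max(in, minV), maxV)
    out[i] = in[i] < minV ? minV : (in[i] > maxV ? maxV : in[i]);
    // backward: the gradient only flows where the input was not clipped
    inGrad[i] += (in[i] > minV && in[i] < maxV) ? outGrad[i] : 0.0;
    std::printf("in=% .1f  out=% .1f  inGrad=% .1f\n", in[i], out[i], inGrad[i]);
  }
  return 0;
}
```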
+ * \f[ + * out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right) + * \f] + */ + +class ClipLayer : public Layer { +protected: + double min_; + double max_; + +public: + explicit ClipLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(clip, ClipLayer); + +bool ClipLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 1U); + auto layerConf = config_.inputs(0).clip_conf(); + min_ = layerConf.min(); + max_ = layerConf.max(); + CHECK_LT(min_, max_); + return true; +} + +void ClipLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV = getInputValue(0); + resetOutput(inV->getHeight(), inV->getWidth()); + MatrixPtr outV = getOutputValue(); + outV->copyFrom(*inV); + outV->clip(min_, max_); +} + +void ClipLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV = getInputValue(0); + MatrixPtr inG = getInputGrad(0); + if (inG) { + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + MatrixPtr tmpMtx; + Matrix::resizeOrCreate( + tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_); + tmpMtx->clipDerivative(*inV, min_, max_); + inG->addDotMul(*outG, *tmpMtx, 1, 1); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp index d1e932ded5..eb6b0445c9 100644 --- a/paddle/gserver/layers/ConvBaseProjection.cpp +++ b/paddle/gserver/layers/ConvBaseProjection.cpp @@ -87,9 +87,6 @@ void ConvBaseProjection::initCudnn() { bwdDataLimitBytes_ = 0; bwdFilterLimitBytes_ = 0; workSpaceInBytes_ = 0; - - batchNum_ = 0; - isSelectAlgo_ = false; } void ConvBaseProjection::reshapeTensorDesc(int batchSize) { @@ -142,32 +139,25 @@ void ConvBaseProjection::reshape(int batchSize) { CHECK_EQ(width, out_->value->getWidth()); CHECK_EQ(calInputSize(), in_->value->getWidth()); - isSelectAlgo_ = (batchSize == batchNum_); - batchNum_ = batchSize; - - if (!isSelectAlgo_) { - reshapeTensorDesc(batchSize); - hl_conv_workspace(imageDesc_, - outputDesc_, - filterDesc_, - convDesc_, - &fwdAlgo_, - &fwdLimitBytes_, - &bwdDataAlgo_, - &bwdDataLimitBytes_, - &bwdFilterAlgo_, - &bwdFilterLimitBytes_); - - size_t maxWorkSpace = 0; - maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); - workSpaceInBytes_ = maxWorkSpace; - - VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ - << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; - } - - isSelectAlgo_ = true; + reshapeTensorDesc(batchSize); + hl_conv_workspace(imageDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + workSpaceInBytes_ = maxWorkSpace; + + VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ + << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; } void *ConvBaseProjection::getSpaceBytes(size_t size) { diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h index 4a33aa1837..e9d9f8f1b2 100644 --- 
a/paddle/gserver/layers/ConvBaseProjection.h +++ b/paddle/gserver/layers/ConvBaseProjection.h @@ -101,12 +101,6 @@ protected: size_t bwdFilterLimitBytes_; /// Size of total work space. size_t workSpaceInBytes_; - - /// Whether to call cuDNN api to choose conv algorithm. - bool isSelectAlgo_; - /// batchNum is used to record batch size. If the batch size is changed, - /// the selection algorithm will be called. - int batchNum_; bool bias_; std::unique_ptr weight_; diff --git a/paddle/gserver/layers/CropLayer.cpp b/paddle/gserver/layers/CropLayer.cpp new file mode 100644 index 0000000000..69ad913420 --- /dev/null +++ b/paddle/gserver/layers/CropLayer.cpp @@ -0,0 +1,146 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CropLayer.h" +#include "paddle/utils/Stat.h" +namespace paddle { + +REGISTER_LAYER(crop, CropLayer); + +bool CropLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + CHECK_LE(static_cast(inputLayers_.size()), 2); + CHECK_GE(static_cast(inputLayers_.size()), 1); + crop_axis_ = config_.axis(); + for (int i = 0; i < config_.offset_size(); i++) { + crop_offsets_.push_back(config_.offset(i)); + } + + // 1. get input_0 shape + auto& input0_img_conf = config_.inputs(0).image_conf(); + inDims_ = TensorShape({0, + input0_img_conf.channels(), + input0_img_conf.has_img_size_y() + ? input0_img_conf.img_size_y() + : input0_img_conf.img_size(), + input0_img_conf.img_size()}); + // 2. get target dims from config + if (config_.inputs_size() == 1) { + targetDims_ = TensorShape({config_.shape(0), + config_.shape(1), + config_.shape(2), + config_.shape(3)}); + } else { + // 2. get input_1 shape + auto& input1_img_conf = config_.inputs(1).image_conf(); + targetDims_ = TensorShape({0, + input1_img_conf.channels(), + input1_img_conf.has_img_size_y() + ? input1_img_conf.img_size_y() + : input1_img_conf.img_size(), + input1_img_conf.img_size()}); + } + + // 3. 
get final crop corner + int dimSize = 4; + crop_corner_ = {0, 0, 0, 0}; + for (int i = 0; i < dimSize; i++) { + if (i >= crop_axis_) { + if (crop_offsets_.size() > 1) { + crop_corner_[i] = crop_offsets_[i - crop_axis_]; + } else { + crop_corner_[i] = crop_offsets_[0]; + } + } + } + + outDims_ = TensorShape(4); + + createFunction( + forward_, "Crop", FuncConfig().set("crop_corner", crop_corner_)); + createFunction( + backward_, "CropGrad", FuncConfig().set("crop_corner", crop_corner_)); + + return true; +} + +void CropLayer::setOutDims() { + MatrixPtr input = inputLayers_[1]->getOutputValue(); + size_t batchSize = input->getHeight(); + // get target dims from input_1 + if (config_.inputs_size() == 2) { + targetDims_.setDim(0, batchSize); + int ch = config_.inputs(0).image_conf().channels(); + if (ch != 0) targetDims_.setDim(1, ch); + int h = inputLayers_[1]->getOutput().getFrameHeight(); + if (h != 0) targetDims_.setDim(2, h); + int w = inputLayers_[1]->getOutput().getFrameWidth(); + if (w != 0) targetDims_.setDim(3, w); + } + // get final crop shape from target dims and crop axis + std::vector crop_shape; + int dimSize = 4; + for (int i = 0; i < dimSize; i++) { + if (i >= crop_axis_) { + crop_shape.push_back(targetDims_[i]); + } else { + crop_shape.push_back(inDims_[i]); + } + } + + outDims_.reshape( + {crop_shape[0], crop_shape[1], crop_shape[2], crop_shape[3]}); + output_.setFrameHeight(crop_shape[2]); + output_.setFrameWidth(crop_shape[3]); +} + +void CropLayer::setInDims() { + MatrixPtr input = inputLayers_[0]->getOutputValue(); + size_t batchSize = input->getHeight(); + inDims_.setDim(0, batchSize); + int h = inputLayers_[0]->getOutput().getFrameHeight(); + if (h != 0) inDims_.setDim(2, h); + int w = inputLayers_[0]->getOutput().getFrameWidth(); + if (w != 0) inDims_.setDim(3, w); +} + +void CropLayer::forward(PassType passType) { + Layer::forward(passType); + setInDims(); + setOutDims(); + int size = outDims_[1] * outDims_[2] * outDims_[3]; + resetOutput(outDims_[0], size); + MatrixPtr outV = getOutputValue(); + REGISTER_TIMER_INFO("CropForward", getName().c_str()); + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), inDims_); + outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO); + forward_[0]->calc(inputs, outputs); +} + +void CropLayer::backward(const UpdateCallback& callback) { + (void)callback; + REGISTER_TIMER_INFO("CropBackward", getName().c_str()); + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outDims_); + outputs.addArg(*getInputGrad(0), inDims_, ADD_TO); + backward_[0]->calc(inputs, outputs); +} +} // namespace paddle diff --git a/paddle/gserver/layers/CropLayer.h b/paddle/gserver/layers/CropLayer.h new file mode 100644 index 0000000000..6b62026210 --- /dev/null +++ b/paddle/gserver/layers/CropLayer.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
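A standalone sketch of the corner/shape bookkeeping implemented by CropLayer above (helper names are illustrative, not part of the codebase): dimensions before crop_axis keep the input extent with a zero corner, while dimensions from crop_axis onward take the reference extent and an offset, either one per dimension or a single shared value.

```cpp
// Illustrative-only sketch of CropLayer's crop corner and output shape logic.
#include <array>
#include <cstddef>
#include <vector>

struct CropSpec {
  std::array<size_t, 4> corner{};  // start index per NCHW dimension
  std::array<size_t, 4> shape{};   // output extent per NCHW dimension
};

CropSpec makeCropSpec(const std::array<size_t, 4>& inDims,
                      const std::array<size_t, 4>& targetDims,
                      int cropAxis,
                      const std::vector<size_t>& offsets) {
  CropSpec spec;
  for (int i = 0; i < 4; ++i) {
    if (i >= cropAxis) {
      spec.corner[i] = offsets.size() > 1 ? offsets[i - cropAxis] : offsets[0];
      spec.shape[i] = targetDims[i];
    } else {
      spec.corner[i] = 0;
      spec.shape[i] = inDims[i];
    }
  }
  return spec;
}
```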
*/ + +#pragma once + +#include "Layer.h" + +namespace paddle { + +/** + * \brief This layer crop input according to the specify conf. + * input_0: input to be cropped + * input_1: optional reference input + * axis: start dimension to be croped + * offset: offset of cropping in each dimension + * shape: if reference input layer was not setted, + * crop input as this shape conf + */ +class CropLayer : public Layer { +public: + explicit CropLayer(const LayerConfig& config) : Layer(config) {} + + ~CropLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + +protected: + void setOutDims(); + void setInDims(); + + int32_t crop_axis_; + std::vector crop_offsets_; + std::vector crop_corner_; + TensorShape inDims_; + TensorShape targetDims_; + TensorShape outDims_; +}; +} // namespace paddle diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp index 09dac05a7a..44ba2c4b7d 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "CudnnBatchNormLayer.h" #include "Layer.h" +#include "paddle/cuda/include/hl_batch_norm.h" #include "paddle/utils/Stat.h" namespace paddle { @@ -79,16 +80,33 @@ void CudnnBatchNormLayer::forward(PassType passType) { savedInvVar); } else { // used movingMean and movingVar in testing - hl_batch_norm_forward_inference(ioDesc_, - input, - ioDesc_, - output, - bnParamDesc_, - gamma, - beta, - movingMean, - movingVar, - EPS); + if (batchSize <= 1024) { + hl_batch_norm_forward_inference(ioDesc_, + input, + ioDesc_, + output, + bnParamDesc_, + gamma, + beta, + movingMean, + movingVar, + EPS); + } else { + // There is a limitation in cudnn library. + // When the batch size is larger than 1024 in cuDNN v5.1, + // the cudnnBatchNormalizationForwardInference will fail. + hl_batch_norm_cuda_inference(input, + output, + gamma, + beta, + movingMean, + movingVar, + EPS, + batchSize, + channels_, + imageH_, + imageW_); + } } /* activation */ { diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index af79e65a7c..0ece279931 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -38,12 +38,26 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, inputShape_.resize(numInputs); filterShape_.resize(numInputs); outputShape_.resize(numInputs); + + std::string convType; + std::string convGradInputType; + std::string convGradFilterType; + for (int i = 0; i < config_.inputs_size(); i++) { std::vector paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; std::vector strides = {(size_t)strideY_[i], (size_t)stride_[i]}; - if (FLAGS_use_nnpack) { - CHECK_EQ(isDeconv_, false); + if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) { + convType = "DepthwiseConv"; + convGradInputType = "DepthwiseConvGradInput"; + convGradFilterType = "DepthwiseConvGradFilter"; + } else { + convType = "GemmConv"; + convGradInputType = "GemmConvGradInput"; + convGradFilterType = "GemmConvGradFilter"; + } + + if (FLAGS_use_nnpack && !isDeconv_) { createFunction(forward_, "NNPACKConv", FuncConfig() @@ -53,21 +67,21 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, .set("algo", std::string("auto"))); } else { createFunction(forward_, - !isDeconv_ ? 
"GemmConv" : "GemmConvGradInput", + !isDeconv_ ? convType : convGradInputType, FuncConfig() .set("paddings", paddings) .set("strides", strides) .set("groups", (size_t)groups_[i])); createFunction(backward_, - !isDeconv_ ? "GemmConvGradInput" : "GemmConv", + !isDeconv_ ? convGradInputType : convType, FuncConfig() .set("paddings", paddings) .set("strides", strides) .set("groups", (size_t)groups_[i])); createFunction(backward_, - "GemmConvGradFilter", + convGradFilterType, FuncConfig() .set("paddings", paddings) .set("strides", strides) diff --git a/paddle/gserver/layers/GruCompute.cu b/paddle/gserver/layers/GruCompute.cu index d5e547dce3..b4f5c54b14 100644 --- a/paddle/gserver/layers/GruCompute.cu +++ b/paddle/gserver/layers/GruCompute.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "GruCompute.h" #include "hl_recurrent_apply.cuh" @@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) { } template <> -void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad, - int frameSize, int batchSize) { +void GruCompute::backward<1>(hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize) { hl_gpu_gru_backward(hppl::backward::gru_stateGrad(), hppl::backward::gru_resetGrad(), value, diff --git a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp new file mode 100644 index 0000000000..8ce591d476 --- /dev/null +++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Layer.h" + +namespace paddle { + +class KmaxSeqScoreLayer : public Layer { +private: + MatrixPtr scores_; + size_t beamSize_; + void kmaxScorePerSeq(const real* score, + real* sortedRes, + const ICpuGpuVectorPtr seqStartPos); + +public: + explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(kmax_seq_score, KmaxSeqScoreLayer); + +bool KmaxSeqScoreLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + bool ret = Layer::init(layerMap, parameterMap); + CHECK_EQ(1U, inputLayers_.size()); + + beamSize_ = config_.beam_size(); + CHECK_GE(beamSize_, 1U); + + setNeedSequenceInfo(false); + setNeedGradient(false); + return ret; +} + +void KmaxSeqScoreLayer::kmaxScorePerSeq(const real* scores, + real* sortedIds, + const ICpuGpuVectorPtr seqStartPos) { + int* starts = seqStartPos->getMutableData(false); + std::vector indices; + for (size_t i = 0; i < seqStartPos->getSize() - 1; ++i) { + int seqLen = starts[i + 1] - starts[i]; + int k = std::min(static_cast(beamSize_), seqLen); + + indices.resize(seqLen, 0); + std::iota(begin(indices), end(indices), 0.); + std::vector tmpScore(scores + starts[i], scores + starts[i + 1]); + std::partial_sort( + begin(indices), + begin(indices) + k, + end(indices), + [&](size_t a, size_t b) { return tmpScore[a] > tmpScore[b]; }); + memcpy(sortedIds + (i * beamSize_), indices.data(), k * sizeof(real)); + } +} + +void KmaxSeqScoreLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& input = getInput(0); + const MatrixPtr inputScore = getInputValue(0); + + CHECK(input.hasSeq() || input.hasSubseq()) + << "input of " << getName() + << " must be a sequence or a nested sequence."; + CHECK_EQ(input.value->getWidth(), 1UL) + << "input of " << getName() + << " is score over a sequence or a nested sequence, so its width " + << " must be 1."; + + if (useGpu_) { + // this Layer runs only in CPU, if the model is runing on GPU, + // then copy the input to this layer from GPU to CPU. + Matrix::resizeOrCreate(scores_, + inputScore->getHeight(), + 1, + false /* trans */, + false /* useGpu */); + scores_->copyFrom(*inputScore); + } else { + scores_ = inputScore; + } + + Matrix::resizeOrCreate( + output_.value, + input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(), + beamSize_, + false, + false); + output_.value->one(); + output_.value->mulScalar(-1.); + + kmaxScorePerSeq(scores_->getData(), + output_.value->getData(), + input.hasSubseq() ? 
input.subSequenceStartPositions + : input.sequenceStartPositions); +} + +void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {} + +} // namespace paddle diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index 4b92b5d163..d5621412ca 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -359,12 +359,11 @@ void Layer::backwardActivation() { /* Do error clipping */ if (config_.error_clipping_threshold() > 0.0f) { if (FLAGS_log_error_clipping) { - CpuVector outGradVec(0, nullptr); - outGradVec.subVecFrom( - output_.grad->getData(), 0, output_.grad->getElementCnt()); - real maxAbsGrad = outGradVec.getAbsMax(); + VectorPtr outGradVec = Vector::create( + output_.grad->getData(), output_.grad->getElementCnt(), useGpu_); + real maxAbsGrad = outGradVec->getAbsMax(); if (maxAbsGrad > config_.error_clipping_threshold()) { - real avgAbsGrad = outGradVec.getAbsSum() / outGradVec.getSize(); + real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize(); LOG(INFO) << " layer=" << config_.name() << " need clipping," << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad; } diff --git a/paddle/gserver/layers/LstmCompute.cu b/paddle/gserver/layers/LstmCompute.cu index f75c0c40cc..d3f59b52a4 100644 --- a/paddle/gserver/layers/LstmCompute.cu +++ b/paddle/gserver/layers/LstmCompute.cu @@ -12,41 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "LstmCompute.h" #include "hl_recurrent_apply.cuh" namespace paddle { template <> -void LstmCompute::forwardBatch<1>(hl_lstm_value value, int frameSize, - int batchSize) { - hl_gpu_lstm_forward(hppl::forward::lstm(), value, frameSize, - batchSize, activeNode_, activeGate_, +void LstmCompute::forwardBatch<1>(hl_lstm_value value, + int frameSize, + int batchSize) { + hl_gpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + batchSize, + activeNode_, + activeGate_, activeState_); } template <> -void LstmCompute::backwardBatch<1>(hl_lstm_value value, hl_lstm_grad grad, - int frameSize, int batchSize) { - hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, - frameSize, batchSize, activeNode_, - activeGate_, activeState_); +void LstmCompute::backwardBatch<1>(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize) { + hl_gpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + batchSize, + activeNode_, + activeGate_, + activeState_); } template <> void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) { - hl_gpu_lstm_forward(hppl::forward::lstm(), value, - frameSize, /* batchSize */ 1, - activeNode_, activeGate_, activeState_); + hl_gpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + /* batchSize */ 1, + activeNode_, + activeGate_, + activeState_); } template <> -void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, hl_lstm_grad grad, +void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, + hl_lstm_grad grad, int frameSize) { - hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, - frameSize, /* batchSize */ 1, - activeNode_, activeGate_, activeState_); + hl_gpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + /* batchSize */ 1, + activeNode_, + activeGate_, + activeState_); } } // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h new file mode 100644 index 
0000000000..4c0234e7b3 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNBase.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "mkldnn.hpp" + +namespace paddle { + +typedef enum { + MKLDNN_BASE = 1, // basical info of MKLDNN + MKLDNN_TESTS = 1, // gtest info of MKLDNN + MKLDNN_SIZES = 2, // size info of MKLDNN + MKLDNN_FMTS = 3, // format info of MKLDNN + MKLDNN_ALL = 4, // show all info of MKLDNN +} MKLDNN_LOG_LEVEL; + +/** + * @brief MKLDNN CPU engine. + * + */ +class CPUEngine { +public: + static CPUEngine& Instance() { + // Thread-safe in C++11. + static CPUEngine myInstance; + return myInstance; + } + + // Disallow copy or move + CPUEngine(const CPUEngine&) = delete; // Copy constructor + CPUEngine(CPUEngine&&) = delete; // Move constructor + CPUEngine& operator=(const CPUEngine&) = delete; // Copy assignment + CPUEngine& operator=(CPUEngine&&) = delete; // Move assignment + + mkldnn::engine& getEngine() { return cpuEngine_; } + +protected: + CPUEngine() : cpuEngine_(mkldnn::engine::cpu, 0) {} + // CPUEngine() : cpuEngine_(mkldnn::engine::cpu_lazy, 0) {} + ~CPUEngine() {} + +private: + mkldnn::engine cpuEngine_; +}; + +/** + * @brief MKLDNN Stream. + * + */ +class MKLDNNStream { +public: + MKLDNNStream() : ready_(false) { resetState(); } + + virtual ~MKLDNNStream() {} + + /** + * @brief Submit stream + * @param prims The primitives vector + * @param block Waiting for the stream to complete + */ + void submit(std::vector& prims, bool block = true) { + resetState(); + stream_->submit(prims).wait(block); + ready_ = false; + } + + /** + * @brief Reset the mkldnn stream + */ + void resetState() { + if (ready_) { + return; + } + // TODO(TJ): change me when mkldnn have method to reset this state + // stream_.reset(new mkldnn::stream(mkldnn::stream::kind::lazy)); + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + ready_ = true; + } + +private: + bool ready_; + std::shared_ptr stream_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp new file mode 100644 index 0000000000..d201fac65e --- /dev/null +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -0,0 +1,286 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
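A minimal usage sketch for the CPUEngine singleton and MKLDNNStream defined in MKLDNNBase.h (the free function and its name are illustrative): primitives are collected into a vector and submitted through the stream, which waits for completion by default.

```cpp
// Usage sketch for the MKLDNN engine/stream helpers declared above.
#include <vector>
#include "MKLDNNBase.h"  // CPUEngine, MKLDNNStream

void runPipeline(std::vector<mkldnn::primitive>& pipeline) {
  // The engine is what memory and primitive descriptors are built against.
  mkldnn::engine& eng = paddle::CPUEngine::Instance().getEngine();
  (void)eng;

  paddle::MKLDNNStream stream;
  stream.submit(pipeline);  // block = true by default: returns when done
}
```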
*/ + +#include "MKLDNNFcLayer.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; +typedef inner_product_forward fc_fwd; +typedef inner_product_backward_weights fc_bwdWgt; +typedef inner_product_backward_data fc_bwdData; + +namespace paddle { + +REGISTER_LAYER(mkldnn_fc, MKLDNNFcLayer); + +bool MKLDNNFcLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + + CHECK_EQ(inputLayers_.size(), 1) << "Only support one input layer yet"; + CHECK_EQ(inputLayers_.size(), parameters_.size()); + CHECK(!parameters_[0]->isSparse()) << "Do not support sparse yet"; + + // output size, cat not be changed + oc_ = getSize(); + oh_ = 1; + ow_ = 1; + + // input size can not change in FC + iLayerSize_ = inputLayers_[0]->getSize(); + CHECK_EQ(parameters_[0]->getSize(), iLayerSize_ * oc_); + + // create weight + weight_ = + std::unique_ptr(new Weight(oc_, iLayerSize_, parameters_[0], 0)); + + // create biases + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, oc_, biasParameter_)); + } + return true; +} + +void MKLDNNFcLayer::convertWeightsFromPaddle() { + if (hasInitedWgt_) { + return; + } + + // TODO(TJ): dst format should get from wgtVal_ + int dstFmt = PARAM_FORMAT_MKLDNN_OI; + int srcFmt = weight_->getParameterPtr()->getHeaderFormat(); + if (srcFmt == dstFmt) { + return; + } + + // The weight_ is transposed from initial paddle weight + MatrixPtr paddleWgt = Matrix::create( + weight_->getW()->getData(), iLayerSize_, oc_, false, false); + + // TODO(TJ): remove this print when do not need differ weights + std::ostringstream ostr; + paddleWgt->print(ostr); + VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str(); + + // The mkldnn weight is transposed from initial paddle matrix + MatrixPtr paddleWgtT; + paddleWgt->transpose(paddleWgtT, true); + weight_->getW()->copyFrom(*paddleWgtT); + weight_->getParameterPtr()->setHeaderFormat(dstFmt); + hasInitedWgt_ = true; +} + +void MKLDNNFcLayer::convertWeightsToPaddle() { + MatrixPtr dnnWgt = weight_->getW(); + MatrixPtr paddleWgt; + dnnWgt->transpose(paddleWgt, true); + + // copy paddle weight and override on weight_ + MatrixPtr dnnWgtT = Matrix::create( + dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false); + dnnWgtT->copyFrom(*paddleWgt); +} + +void MKLDNNFcLayer::reshape() { + const Argument& input = getInput(0); + int batchSize = input.getBatchSize(); + if (bs_ == batchSize) { + return; + } + bs_ = batchSize; + ih_ = input.getFrameHeight(); + iw_ = input.getFrameWidth(); + if (ih_ == 0) { + ih_ = 1; + } + if (iw_ == 0) { + iw_ = 1; + } + hasSpatial_ = true; + if (ih_ == 1 && iw_ == 1) { + hasSpatial_ = false; + } + CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize()); + ic_ = iLayerSize_ / (ih_ * iw_); + CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible"; + CHECK_EQ(size_t(oc_), getSize()); + printSizeInfo(); + + // reset output + output_.setFrameHeight(oh_); + output_.setFrameWidth(ow_); + resetOutput(bs_, oc_); + + // reset mkldnn forward + resetFwd(); + needResetBwd_ = true; + + convertWeightsFromPaddle(); +} + +void MKLDNNFcLayer::resetFwd() { + bool hasBias = biases_ && biases_->getW(); + real* iData = getInputValue(0)->getData(); + real* oData = getOutputValue()->getData(); + real* wData = weight_->getW()->getData(); + real* bData = hasBias ? 
biases_->getW()->getData() : NULL; + + // TODO(TJ): below create should be covered in MkldnnMatrix + // create memory desc + memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) + : createMD({bs_, ic_}, format::nc); + memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw) + : createMD({oc_, ic_}, format::oi); + memory::desc bMD = bData != NULL ? createMD({oc_}, format::x) + : createMD({}, format::format_undef); + memory::desc oMD = createMD({bs_, oc_}, format::nc); + + // create memory primitive desc and memory self + inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); + wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData)); + outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData)); + + prop_kind pk = prop_kind::forward; + fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD) + : fc_fwd::desc(pk, iMD, wMD, oMD); + fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); + + if (bData != NULL) { + biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData)); + fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_)); + } else { + fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_)); + } + pipelineFwd_.clear(); + pipelineFwd_.push_back(*fwd_); +} + +void MKLDNNFcLayer::resetBwd() { + if (!needResetBwd_) { + return; + } + needResetBwd_ = false; + + bool hasBias = biases_ && biases_->getWGrad(); + real* iData = getInputValue(0)->getData(); + real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL; + real* oDiff = getOutputGrad()->getData(); + real* wDiff = weight_->getWGrad()->getData(); + real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL; + + /// backward weight + // create memory desc for backward memory + memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) + : createMD({bs_, ic_}, format::nc); + memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw) + : createMD({oc_, ic_}, format::oi); + memory::desc oMD = createMD({bs_, oc_}, format::nc); + memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x) + : createMD({}, format::format_undef); + + if (inVal_) { + // update data + inVal_->set_data_handle(iData); + } else { + inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); + } + + // create memory primitive desc and memory self + wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff)); + outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff)); + + fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD); + fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); + fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL + ? 
fc_bwdWgt::desc(iMD, wMD, bMD, oMD) + : fc_bwdWgt::desc(iMD, wMD, oMD); + fc_bwdWgt::primitive_desc bwdWgtPD = + fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD); + + if (bDiff != NULL) { + biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff)); + bwdWgt_.reset( + new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_)); + } else { + bwdWgt_.reset(new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_)); + } + pipelineBwd_.clear(); + pipelineBwd_.push_back(*bwdWgt_); + + /// backward data + if (iDiff == NULL) { + return; + } + fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD); + fc_bwdData::primitive_desc bwdDataPD = + fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD); + inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff)); + CHECK(wgtVal_) << "Should have weight memory"; + bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_)); + pipelineBwd_.push_back(*bwdData_); +} + +void MKLDNNFcLayer::forward(PassType passType) { + Layer::forward(passType); + reshape(); + + { + REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); + + // update input data + // since it might be changed if this is after data layer + real* iData = getInputValue(0)->getData(); + inVal_->set_data_handle(iData); + + // just submit forward pipeline + stream_->submit(pipelineFwd_); + } + + /* activation */ { + REGISTER_TIMER_INFO("FwActTimer", getName().c_str()); + forwardActivation(); + } +} + +void MKLDNNFcLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpActTimer", getName().c_str()); + backwardActivation(); + } + + { + REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); + resetBwd(); + + // update diff + real* oDiff = getOutputGrad()->getData(); + outGrad_->set_data_handle(oDiff); + + // just sumbmit backward pipeline + stream_->submit(pipelineBwd_); + } + + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + weight_->getParameterPtr()->incUpdate(callback); + if (biases_ && biases_->getWGrad()) { + biases_->getParameterPtr()->incUpdate(callback); + } + } +} +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h new file mode 100644 index 0000000000..7954852a23 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNFcLayer.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "MKLDNNLayer.h" +#include "mkldnn.hpp" + +namespace paddle { + +/** + * @brief A subclass of MKLDNNLayer fc layer. 
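The weight handling in convertWeightsFromPaddle()/convertWeightsToPaddle() above boils down to a transpose between Paddle's (inputSize x outputSize) storage and the MKL-DNN "oi" layout (outputSize x inputSize). A standalone sketch, assuming row-major float buffers (not the Paddle implementation, which goes through Matrix::transpose):

```cpp
// Illustrative-only layout conversion: Paddle (in x out) -> MKL-DNN "oi" (out x in).
#include <cstddef>
#include <vector>

std::vector<float> toMkldnnOi(const std::vector<float>& paddleWeight,
                              size_t inputSize, size_t outputSize) {
  std::vector<float> oi(outputSize * inputSize);
  for (size_t i = 0; i < inputSize; ++i) {
    for (size_t o = 0; o < outputSize; ++o) {
      oi[o * inputSize + i] = paddleWeight[i * outputSize + o];
    }
  }
  return oi;
}
```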
+ * + * The config file api is mkldnn_fc + */ +class MKLDNNFcLayer : public MKLDNNLayer { +protected: + // input layer size, can not be change after init + size_t iLayerSize_; // == ic * ih * iw + + // if has already init the weight + bool hasInitedWgt_; + + // if input layer has image size info (ih>1 && iw>1) + bool hasSpatial_; + + // fc weight and bias + std::unique_ptr weight_; + std::unique_ptr biases_; + +public: + explicit MKLDNNFcLayer(const LayerConfig& config) + : MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {} + + ~MKLDNNFcLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void convertWeightsFromPaddle() override; + + void convertWeightsToPaddle() override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + +protected: + /** + * reshape the input image sizes + * and reset output buffer size + * and reset mkldnn forward + */ + void reshape(); + + /** + * reset the forward primitve and memory + * only would be called when input size changes + */ + void resetFwd(); + + /** + * reset the backward primitve and memory for mkldnn fc + * only would be called when needed + */ + void resetBwd(); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h new file mode 100644 index 0000000000..63e29f447e --- /dev/null +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "MKLDNNBase.h" +#include "mkldnn.hpp" + +DECLARE_bool(use_mkldnn); +DECLARE_bool(use_mkldnn_wgt); + +namespace paddle { + +class MKLDNNLayer; +typedef std::shared_ptr MKLDNNLayerPtr; + +/** + * @brief Base class of MKLDNNlayer. 
+ * + */ +class MKLDNNLayer : public Layer { +protected: + // batch size + int bs_; + // input image channel, height and width + int ic_, ih_, iw_; + // output image channel, height and width + int oc_, oh_, ow_; + + // backward also need reset after reset forward handle + bool needResetBwd_; + + // mkldnn engine, stream and primivtives + mkldnn::engine engine_; + std::shared_ptr stream_; + std::shared_ptr fwd_; + std::shared_ptr bwdWgt_; + std::shared_ptr bwdData_; + std::vector pipelineFwd_; + std::vector pipelineBwd_; + + // TODO(TJ): change below memory as MKLDNNMatrixPtr type + std::shared_ptr inVal_; + std::shared_ptr inGrad_; + std::shared_ptr outVal_; + std::shared_ptr outGrad_; + std::shared_ptr wgtVal_; + std::shared_ptr wgtGrad_; + std::shared_ptr biasVal_; + std::shared_ptr biasGrad_; + +public: + explicit MKLDNNLayer(const LayerConfig& config) + : Layer(config), + bs_(0), + ic_(0), + ih_(0), + iw_(0), + oc_(0), + oh_(0), + ow_(0), + needResetBwd_(true), + engine_(mkldnn::engine::cpu, 0), + stream_(nullptr), + fwd_(nullptr), + bwdWgt_(nullptr), + bwdData_(nullptr) {} + + ~MKLDNNLayer() {} + + virtual bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!Layer::init(layerMap, parameterMap)) { + return false; + } + + CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." + << "Please set WITH_MKLDNN=ON " + << "and set use_mkldnn=True"; + stream_.reset(new MKLDNNStream()); + engine_ = CPUEngine::Instance().getEngine(); + + // TODO(TJ): deivecId + return true; + } + + /** + * convert weight from paddle format to mkldnn format + * weight_ will be override + */ + virtual void convertWeightsFromPaddle() {} + + /** + * convert mkldnn weight to paddle format + * weight_ will be override + */ + virtual void convertWeightsToPaddle() {} + + /** + * print info about sizes + */ + virtual void printSizeInfo() { + VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_ + << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_ + << ", oh: " << oh_ << ", ow: " << ow_; + } + + // TODO(TJ): move to MkldnnMatrix + // create memory desc + inline mkldnn::memory::desc createMD( + mkldnn::memory::dims dims, + mkldnn::memory::format fmt, + mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) { + // TODO(TJ): isFmtSuppoted(fmt) + return mkldnn::memory::desc(dims, type, fmt); + } +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp index a97fa6bf78..0a1e17b9aa 100644 --- a/paddle/gserver/layers/PrintLayer.cpp +++ b/paddle/gserver/layers/PrintLayer.cpp @@ -29,7 +29,7 @@ public: vals.push_back(s.str()); } size_t pos = 0; - int i = 0; + size_t i = 0; std::ostringstream s; const std::string& format = config_.user_arg(); while (true) { diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/gserver/layers/RowL2NormLayer.cpp new file mode 100644 index 0000000000..0d609be43b --- /dev/null +++ b/paddle/gserver/layers/RowL2NormLayer.cpp @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" + +namespace paddle { + +/** + * A layer for L2 normalization in each row, + * \f[ + * out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}} + * \f] + * where the size of \f$in\f$ is (batchSize x dataDim), + * and the size of \f$out\f$ is (batchSize x dataDim). + */ + +class RowL2NormLayer : public Layer { +protected: + MatrixPtr inSquare_; + MatrixPtr l2NormReciprocal_; + MatrixPtr dotSum_; + +public: + explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(row_l2_norm, RowL2NormLayer); + +bool RowL2NormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 1U); + + return true; +} + +void RowL2NormLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV = getInputValue(0); + + /* malloc memory for the output_ if necessary */ + size_t batchSize = inV->getHeight(); + size_t dataDim = getSize(); + CHECK_EQ(dataDim, inV->getWidth()); + resetOutput(batchSize, dataDim); + MatrixPtr outV = getOutputValue(); + + Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_); + inV->square2(*inSquare_); + Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_); + inSquare_->rowSum(*l2NormReciprocal_); + l2NormReciprocal_->sqrt2(*l2NormReciprocal_); + l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0); + outV->rowScale(0, *inV, *l2NormReciprocal_); +} + +void RowL2NormLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV = getInputValue(0); + MatrixPtr inG = getInputGrad(0); + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + size_t batchSize = inV->getHeight(); + + // inG[ij] += outG[ij] / l2NormReciprocal + // inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i], + // inV[i]) + if (inG) { + Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_); + dotSum_->zeroMem(); + dotSum_->rowDotMul(0, *outG, *outV); + dotSum_->dotMul(*dotSum_, *l2NormReciprocal_); + dotSum_->dotMul(*dotSum_, *l2NormReciprocal_); + inSquare_->rowScale(0, *inV, *dotSum_); + inG->sub(*inSquare_); + inG->addRowScale(0, *outG, *l2NormReciprocal_); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ScaleShiftLayer.cpp b/paddle/gserver/layers/ScaleShiftLayer.cpp new file mode 100644 index 0000000000..35fd038ab4 --- /dev/null +++ b/paddle/gserver/layers/ScaleShiftLayer.cpp @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" + +namespace paddle { + +/** + * A layer applies a linear transformation to each element in each row of + * the input matrix. 
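As a plain-C++ reference for the RowL2NormLayer forward pass above (the layer itself works through Matrix primitives such as square2, rowSum, sqrt2 and rowScale; the epsilon guard below is an assumption of this sketch and is not present in the layer code):

```cpp
// Standalone per-row L2 normalization: out[i] = in[i] / ||in||_2 for each row.
#include <cmath>
#include <cstddef>
#include <vector>

void rowL2Normalize(std::vector<float>& data, size_t batchSize, size_t dim) {
  const float kEps = 1e-12f;  // guard for an all-zero row (sketch only)
  for (size_t r = 0; r < batchSize; ++r) {
    float sumSq = 0.f;
    for (size_t c = 0; c < dim; ++c) {
      sumSq += data[r * dim + c] * data[r * dim + c];
    }
    float invNorm = 1.f / (std::sqrt(sumSq) + kEps);
    for (size_t c = 0; c < dim; ++c) {
      data[r * dim + c] *= invNorm;
    }
  }
}
```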
For each element, the layer first re-scale it and then + * adds a bias to it. + * + * \f[ + * y = wx + b + * \f] + * + * Here, w is the scale and b is the bias. Both w and b are trainable scalars. + * + */ + +class ScaleShiftLayer : public Layer { +protected: + std::unique_ptr scale_; + std::unique_ptr offset_; + +public: + explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(scale_shift, ScaleShiftLayer); + +bool ScaleShiftLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + CHECK_EQ(inputLayers_.size(), 1U); + scale_.reset(new Weight(1, 1, parameters_[0])); + if (biasParameter_.get() != NULL) { + offset_ = std::unique_ptr(new Weight(1, 1, biasParameter_)); + } + return true; +} + +void ScaleShiftLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV = getInputValue(0); + resetOutput(inV->getHeight(), inV->getWidth()); + MatrixPtr outV = getOutputValue(); + real scaleValue = scale_->getW()->getElement(0, 0); + outV->mulScalar(*inV, scaleValue); + if (offset_) { + real offsetValue = offset_->getW()->getElement(0, 0); + outV->add(offsetValue); + } +} + +void ScaleShiftLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV = getInputValue(0); + MatrixPtr inG = getInputGrad(0); + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + + /* Calculate the parameter gradient for the current layer */ + if (scale_->getWGrad()) { + MatrixPtr rowSumMtx; + Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_); + // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij} + rowSumMtx->sumOfProducts( + /* b= */ *inV, /* c= */ *outG, /* scaleSum= */ 1, /* scaleDest= */ 0.); + // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji} + scale_->getWGrad()->sumCols( + /* b= */ *rowSumMtx, /* scaleSum= */ 1., /* scaleDest= */ 1.); + scale_->getParameterPtr()->incUpdate(callback); + } + if (offset_ && offset_->getWGrad()) { + MatrixPtr rowSumMtx; + Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_); + rowSumMtx->sumRows(*outG, 1., 0.); + offset_->getWGrad()->sumCols(*rowSumMtx, 1., 1.); + offset_->getParameterPtr()->incUpdate(callback); + } + + /* Calculate the input layers error */ + if (inG) { + real scaleValue = scale_->getW()->getElement(0, 0); + inG->add(*outG, scaleValue); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SliceProjection.cpp b/paddle/gserver/layers/SliceProjection.cpp new file mode 100644 index 0000000000..267dd6154b --- /dev/null +++ b/paddle/gserver/layers/SliceProjection.cpp @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
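The gradients computed in ScaleShiftLayer::backward() reduce to three closed forms for y = w*x + b with scalar, trainable w and b. A standalone sketch (illustrative names, one flattened buffer):

```cpp
// dw = sum(outGrad .* x),  db = sum(outGrad),  dx = w * outGrad.
#include <cstddef>
#include <vector>

struct ScaleShiftGrads {
  float dw = 0.f;
  float db = 0.f;
  std::vector<float> dx;
};

ScaleShiftGrads scaleShiftBackward(const std::vector<float>& x,
                                   const std::vector<float>& outGrad,
                                   float w) {
  ScaleShiftGrads g;
  g.dx.resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    g.dw += outGrad[i] * x[i];
    g.db += outGrad[i];
    g.dx[i] = w * outGrad[i];
  }
  return g;
}
```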
*/ + +#include "Projection.h" + +namespace paddle { + +/** + * SliceProjection can slice the input value into multiple parts, + * and then select some of them to merge into a new output. + * + * First, calculate the slices that need to be merged into the output. + * slices = input.slices().for_output() + * + * Second, merge each slice into the output. + * for(auto slice: slices) { + * out.addAtOffset(slice, offset); + * } + * + * Input slices as output: s0, s1, ...: + * ----------------------- + * |///| |//////| | + * |/s0| |//s1//| | + * |///| |//////| | + * ----------------------- + * Output, merge s0, s1, ... into one output: + * ---------------- + * |///|//////| | + * |/s0|//s1//|...| + * |///|//////| | + * ---------------- + * + * The config file api is slice_projection. + */ +class SliceProjection : public Projection { +public: + SliceProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, + bool useGpu); + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + +protected: + std::vector> slices_; +}; + +REGISTER_PROJECTION(slice, SliceProjection); + +/** + * Constructed function. + * @note SliceProjection should not have any parameter. + */ +SliceProjection::SliceProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, + bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK(!parameter) << "'slice' projection should not have any parameter"; + + slices_.reserve(config.slices_size()); + for (const auto& slice : config.slices()) { + slices_.push_back(std::make_pair(slice.start(), slice.end())); + } +} + +void SliceProjection::forward() { + size_t offset = 0; + for (auto& slice : slices_) { + auto slice_out = in_->value->subColMatrix(slice.first, slice.second); + out_->value->addAtOffset(*slice_out, offset); + offset += slice_out->getWidth(); + } +} + +void SliceProjection::backward(const UpdateCallback& callback) { + if (in_->grad) { + size_t offset = 0; + for (auto& slice : slices_) { + auto slice_out = in_->grad->subColMatrix(slice.first, slice.second); + slice_out->addAtOffset(*out_->grad, offset); + offset += slice_out->getWidth(); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp new file mode 100644 index 0000000000..648d3908f3 --- /dev/null +++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp @@ -0,0 +1,176 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
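What SliceProjection::forward() does per row can be stated in a few lines of standalone code (illustrative only): copy each selected column range [start, end) into the output at a running offset; backward() walks the same offsets and adds the output gradient back into the corresponding input slices.

```cpp
// Illustrative per-row slice merge matching SliceProjection::forward().
#include <cstddef>
#include <utility>
#include <vector>

std::vector<float> mergeSlices(
    const std::vector<float>& inRow,
    const std::vector<std::pair<size_t, size_t>>& slices) {
  std::vector<float> outRow;
  for (const auto& s : slices) {
    outRow.insert(outRow.end(), inRow.begin() + s.first, inRow.begin() + s.second);
  }
  return outRow;
}
```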
*/ + +#include "Layer.h" +#include "paddle/math/Matrix.h" +#include "paddle/math/Vector.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +class SubNestedSequenceLayer : public Layer { +public: + explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + +private: + /* + * This functions generates the indices of rows in a batch according to the + * indices of selected sub-sequence in each sequence. + * + * Examples: + * selectedIndices: + * [ + * [0, 1, -1], + * [0, 1, 2], + * [0, -1, -1], + * [0, 2, 3], + * ] + * inputSeqInfo: + * [ + * [0,3,4], + * [4,5,7,10,15], + * [15,20], + * [20,22,23,25,28] + * ] + * + * ths output is saved to private member rowIndice_; + * [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, + * 16,17,18,19,20,21,22,23,24,25,26,27] + */ + + void calSelectedCols(const MatrixPtr selectedIndices, + const std::vector>& inputSeqInfo); + + // if the second input of this layer is on GPU memory, copy it to CPU memory. + MatrixPtr selIdsCpu_; + + // reorganized sequenceStartPositions and subSequenceStartPositions + // into a 2d vector to facilitate the sequence selection process. + std::vector> inputSeqInfoVec_; + + // the final selected row indices in a batch, + // rowIdx_ and selectedRows_ actually share a same memory. + IVectorPtr rowIndice_; + std::vector selectedRows_; +}; + +REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer); + +bool SubNestedSequenceLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + CHECK_EQ(2U, inputLayers_.size()); + setNeedSequenceInfo(false); + return true; +} + +void SubNestedSequenceLayer::calSelectedCols( + const MatrixPtr selectedIndices, + const std::vector>& inputSeqInfo) { + selectedRows_.clear(); + + std::vector outSeqStartInfo(1, 0); + std::vector outSubSeqStartInfo(1, 0); + + size_t seqNum = selectedIndices->getHeight(); + size_t beamSize = selectedIndices->getWidth(); + for (size_t i = 0; i < seqNum; ++i) { + for (size_t j = 0; j < beamSize; ++j) { + if (selectedIndices->getElement(i, j) == -1.) break; + size_t selSubSeqIdx = selectedIndices->getElement(i, j); + CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx); + + size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] - + inputSeqInfoVec_[i][selSubSeqIdx]; + for (size_t k = 0; k < subSeqLen; ++k) + selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k); + outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen); + } + outSeqStartInfo.push_back(outSubSeqStartInfo.back()); + } + + if (useGpu_) { + rowIndice_ = IVector::create(selectedRows_.size(), useGpu_); + rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size()); + } else { + rowIndice_ = + IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_); + } + + // create the sequence information for the output. 
+ ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, outSeqStartInfo.size(), false); + output_.sequenceStartPositions->copyFrom( + outSeqStartInfo.data(), outSeqStartInfo.size(), false); + + ICpuGpuVector::resizeOrCreate( + output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false); + output_.subSequenceStartPositions->copyFrom( + outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false); +} + +void SubNestedSequenceLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& inputSeq = getInput(0); + CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer " + << "must be a nested sequence."; + const MatrixPtr selectedIndices = getInputValue(1); + CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight()); + + if (dynamic_cast(selectedIndices.get())) { + /* + * Currently, the second input for this layer is generated by + * kmax_sequence_score_layer whose output is always stored on CPU, + * or a data_layer which canbe on GPU. + * + * If the second input is on GPU, copy it to CPU memory, because this + * input always uses very few memory, and operations related to it are + * all logic control, not computations. + */ + Matrix::resizeOrCreate(selIdsCpu_, + selectedIndices->getHeight(), + selectedIndices->getWidth(), + false /* trans */, + false /* useGpu */); + selIdsCpu_->copyFrom(*selectedIndices); + } else { + selIdsCpu_ = selectedIndices; + } + + Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions, + inputSeq.subSequenceStartPositions, + inputSeqInfoVec_); + calSelectedCols(selIdsCpu_, inputSeqInfoVec_); + + resetOutput(selectedRows_.size(), getSize()); + getOutputValue()->selectRows(*getInputValue(0), *rowIndice_); +} + +void SubNestedSequenceLayer::backward(const UpdateCallback& callback) { + MatrixPtr inputSeqGrad = getInputGrad(0); + MatrixPtr outputGrad = getOutputGrad(); + + if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_); +} + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 92f6cbcfe5..c2a2993620 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -9,7 +9,7 @@ add_unittest_without_exec(test_ProtoDataProvider # mkdir will get error. 
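The row-index expansion performed by calSelectedCols() above, whose selectedIndices input is typically produced by kmax_seq_score, can be sketched standalone as follows (plain int containers are used here for clarity; the layer reads the indices from a real-valued matrix and -1 marks an unused slot):

```cpp
// Illustrative expansion of selected sub-sequences into batch row indices,
// i.e. the rows later gathered by selectRows() in forward().
#include <cstddef>
#include <vector>

std::vector<int> expandSelectedRows(
    const std::vector<std::vector<int>>& selectedIndices,  // -1 padded per sequence
    const std::vector<std::vector<int>>& inputSeqInfo) {   // sub-seq start rows per sequence
  std::vector<int> rows;
  for (size_t i = 0; i < selectedIndices.size(); ++i) {
    for (int sel : selectedIndices[i]) {
      if (sel == -1) break;  // remaining slots of this sequence are unused
      for (int r = inputSeqInfo[i][sel]; r < inputSeqInfo[i][sel + 1]; ++r) {
        rows.push_back(r);
      }
    }
  }
  return rows;
}
```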
add_test(NAME test_ProtoDataProvider COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider - WORKING_DIRECTORY ${PROJ_ROOT}/paddle) + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) ################# test_LayerGrad ####################### add_unittest_without_exec(test_LayerGrad @@ -18,6 +18,15 @@ add_unittest_without_exec(test_LayerGrad add_test(NAME test_LayerGrad COMMAND test_LayerGrad) +########## test_Mkldnn layers and activations ########## +if(WITH_MKLDNN) + add_unittest_without_exec(test_MKLDNN + test_MKLDNN.cpp + MKLDNNTester.cpp + LayerGradUtil.cpp) + add_test(NAME test_MKLDNN COMMAND test_MKLDNN) +endif() + ################ test_CRFLayerGrad #################### add_unittest_without_exec(test_CRFLayerGrad test_CRFLayerGrad.cpp @@ -50,13 +59,13 @@ add_unittest_without_exec(test_DetectionOutput test_DetectionOutput.cpp LayerGradUtil.cpp) -add_test(NAME test_DetectionOutput +add_test(NAME test_DetectionOutput COMMAND test_DetectionOutput) ################# test_ConvUnify ####################### add_unittest_without_exec(test_ConvUnify test_ConvUnify.cpp LayerGradUtil.cpp) - + add_test(NAME test_ConvUnify COMMAND test_ConvUnify) ################# test_BatchNorm ####################### @@ -66,6 +75,16 @@ add_unittest_without_exec(test_BatchNorm add_test(NAME test_BatchNorm COMMAND test_BatchNorm) + + +################# test_KmaxSeqScore ####################### +add_unittest_without_exec(test_KmaxSeqScore + test_KmaxSeqScore.cpp + LayerGradUtil.cpp) + +add_test(NAME test_KmaxSeqScore + COMMAND test_KmaxSeqScore) + ################## test_Evaluator ####################### add_unittest(test_Evaluator test_Evaluator.cpp) @@ -82,8 +101,8 @@ if(WITH_PYTHON) test_PyDataProvider.cpp) add_test(NAME test_PyDataProvider - COMMAND .set_python_path.sh -d ./gserver/tests:${PROJ_ROOT}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider - WORKING_DIRECTORY ${PROJ_ROOT}/paddle) + COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() ############### test_RecurrentLayer ####################### @@ -96,7 +115,7 @@ if(NOT WITH_DOUBLE) add_test(NAME test_WarpCTCLayer COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR} - WORKING_DIRECTORY ${PROJ_ROOT}/paddle) + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() ############### test_RecurrentGradientMachine ############### @@ -106,20 +125,20 @@ add_unittest_without_exec(test_RecurrentGradientMachine test_RecurrentGradientMachine.cpp) add_test(NAME test_RecurrentGradientMachine COMMAND .set_python_path.sh -d - ${PROJ_ROOT}/python:${PROJ_ROOT}/paddle/gserver/tests + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine - WORKING_DIRECTORY ${PROJ_ROOT}/paddle) + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) add_unittest_without_exec(test_NetworkCompare test_NetworkCompare.cpp) if(WITH_GPU) add_test(NAME test_NetworkCompare - COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true - WORKING_DIRECTORY ${PROJ_ROOT}/paddle) + COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) else() add_test(NAME test_NetworkCompare - COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false - 
WORKING_DIRECTORY ${PROJ_ROOT}/paddle) + COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() @@ -127,6 +146,6 @@ add_unittest_without_exec(test_PyDataProvider2 test_PyDataProvider2.cpp) add_test(NAME test_PyDataProvider2 - COMMAND .set_python_path.sh -d ${PROJ_ROOT}/paddle/gserver/tests:${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 - WORKING_DIRECTORY ${PROJ_ROOT}/paddle + COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle ) diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 9eca58f1a1..a38880e14c 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -388,19 +388,27 @@ void initDataLayer(TestConfig testConf, data.grad->zeroMem(); break; case INPUT_SELF_DEFINE_DATA: { - size_t height = testConf.inputDefs[i].selfDefinedData->getHeight(); - size_t width = testConf.inputDefs[i].selfDefinedData->getWidth(); - CHECK_GT(static_cast(height), 0); - CHECK_GT(static_cast(width), 0); - data.value = Matrix::create(height, width, false, useGpu); - data.grad = Matrix::create(height, width, false, useGpu); - data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData); - data.grad->zeroMem(); + if (testConf.inputDefs[i].ids.size()) { + data.ids = IVector::create(testConf.inputDefs[i].ids.size(), useGpu); + data.ids->copyFrom(testConf.inputDefs[i].ids.data(), + testConf.inputDefs[i].ids.size()); + } else if (testConf.inputDefs[i].selfDefinedData) { + size_t height = testConf.inputDefs[i].selfDefinedData->getHeight(); + size_t width = testConf.inputDefs[i].selfDefinedData->getWidth(); + CHECK_GT(static_cast(height), 0); + CHECK_GT(static_cast(width), 0); + data.value = Matrix::create(height, width, false, useGpu); + data.grad = Matrix::create(height, width, false, useGpu); + data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData); + data.grad->zeroMem(); + } else { + LOG(FATAL) << "No self-defined data are given."; + return; + } const std::vector& labelSeqStartPositions = testConf.inputDefs[i].labelSeqStartPositions; if (labelSeqStartPositions.size() != 0) { - CHECK(!sequenceStartPositions); CHECK_GE(static_cast(labelSeqStartPositions.size()), 2); sequenceStartPositions = @@ -410,6 +418,19 @@ void initDataLayer(TestConfig testConf, useGpu); data.sequenceStartPositions = sequenceStartPositions; } + + const std::vector& labelSubSeqStartPositions = + testConf.inputDefs[i].labelSubSeqStartPositions; + if (labelSubSeqStartPositions.size() != 0) { + CHECK_GE(static_cast(labelSubSeqStartPositions.size()), 2); + + subSequenceStartPositions = + ICpuGpuVector::create(labelSubSeqStartPositions.size(), useGpu); + subSequenceStartPositions->copyFrom(labelSubSeqStartPositions.data(), + labelSubSeqStartPositions.size(), + useGpu); + data.subSequenceStartPositions = subSequenceStartPositions; + } break; } default: diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index d299b4dd09..88e831f78b 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -67,6 +67,8 @@ struct InputDef { bool isStatic; std::vector labelInitValue; std::vector labelSeqStartPositions; + std::vector labelSubSeqStartPositions; + std::vector ids; MatrixPtr selfDefinedData; 
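// The labelSubSeqStartPositions and ids members added above let a test feed a
// layer fully self-defined, optionally nested, sequence input. A minimal
// sketch of how a test config might use them (values are illustrative only):
//
//   MatrixPtr scores = Matrix::create(5, 1, /* trans */ false, /* useGpu */ false);
//   std::vector<int> seqStart = {0, 2, 5};          // two sequences
//   std::vector<int> subSeqStart = {0, 1, 2, 4, 5}; // their sub-sequences
//   config.inputDefs.push_back(
//       {INPUT_SELF_DEFINE_DATA, "scores", scores, seqStart, subSeqStart});
//
// (The ids vector analogously feeds the integer-id constructor defined below.)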
InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { @@ -81,8 +83,10 @@ struct InputDef { InputDef(InputType type, string nameIn, MatrixPtr selfDefinedData, - std::vector selfDefinedSeqStartPos = {}) + std::vector selfDefinedSeqStartPos = {}, + std::vector selfDefinedSubSeqStartPos = {}) : labelSeqStartPositions(selfDefinedSeqStartPos), + labelSubSeqStartPositions(selfDefinedSubSeqStartPos), selfDefinedData(selfDefinedData) { inputType = type; name = nameIn; @@ -92,6 +96,23 @@ struct InputDef { isStatic = false; } + InputDef(InputType type, + string nameIn, + const std::vector& ids, + const std::vector& selfDefinedSeqStartPos = {}, + const std::vector& selfDefinedSubSeqStartPos = {}) + : labelSeqStartPositions(selfDefinedSeqStartPos), + labelSubSeqStartPositions(selfDefinedSubSeqStartPos), + ids(ids) { + selfDefinedData = nullptr; + inputType = type; + name = nameIn; + dim = 0; + sparse = {""}; + paraSize = 0; + isStatic = false; + } + InputDef(InputType type, string nameIn, size_t dimIn, diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp new file mode 100644 index 0000000000..de1635be2a --- /dev/null +++ b/paddle/gserver/tests/MKLDNNTester.cpp @@ -0,0 +1,382 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "MKLDNNTester.h" +#include "paddle/gserver/layers/MKLDNNBase.h" +#include "paddle/gserver/layers/MKLDNNLayer.h" + +namespace paddle { + +// init data layer and test layer of both dnn and reference +void MKLDNNTester::reset(const TestConfig& dnn, + const TestConfig& ref, + size_t batchSize) { + const bool trans = false; + const bool useGpu = false; + + // clear + configs_.clear(); + layerNames_.clear(); + dataLayers_.clear(); + datas_.clear(); + layerMaps_.clear(); + parameters_.clear(); + testLayers_.clear(); + + // resize + configs_.resize(NUM); + layerNames_.resize(NUM); + dataLayers_.resize(NUM); + datas_.resize(NUM); + layerMaps_.resize(NUM); + parameters_.resize(NUM); + testLayers_.resize(NUM); + + // reset configs and layer names + configs_[DNN] = dnn; + configs_[REF] = ref; + layerNames_[DNN] = "mkldnn"; // the first is mkldnn layer + layerNames_[REF] = "reference"; // second is reference layer + + // reset others + for (size_t i = 0; i < NUM; ++i) { + configs_[i].layerConfig.set_name(layerNames_[i]); + initDataLayer(configs_[i], + &(dataLayers_[i]), + &(datas_[i]), + &(layerMaps_[i]), + layerNames_[i], + batchSize, + trans, + useGpu); + initTestLayer( + configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i])); + } + dnnLayer_ = testLayers_[DNN]; + refLayer_ = testLayers_[REF]; + EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size()); + EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size()); + + setInputImgSize(); +} + +void MKLDNNTester::setInputImgSize() { + for (size_t n = 0; n < dataLayers_.size(); ++n) { + for (size_t i = 0; i < dataLayers_[n].size(); ++i) { + // TODO(TJ): fix me when concat and elewise ready + dataLayers_[n][i]->getOutput().setFrameHeight(ih_); + dataLayers_[n][i]->getOutput().setFrameWidth(iw_); + } + } +} + +// init randome parameters of ref, and copy to mkldnn +void MKLDNNTester::randomWgtDatas() { + EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size()); + for (size_t i = 0; i < parameters_[REF].size(); ++i) { + const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE); + const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE); + parameters_[REF][i]->randomize(); + dnnValue->copyFrom(*refValue); + + VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName(); + printVector(dnnValue); + } +} + +// random botdata of ref layer and copy same to mkldnn +void MKLDNNTester::randomBotDatas() { + CHECK_EQ(dataLayers_.size(), NUM); + for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) { + dataLayers_[REF][i]->getOutputValue()->randomizeUniform(); + dataLayers_[DNN][i]->getOutputValue()->copyFrom( + *(dataLayers_[REF][i]->getOutputValue())); + VLOG(lvl_) << "Input " << i << " data:"; + printMatrix(dataLayers_[REF][i]->getOutputValue()); + } +} + +void MKLDNNTester::randomTopDiffs() { + refLayer_->getOutputGrad()->randomizeUniform(); + dnnLayer_->getOutputGrad()->copyFrom(*(refLayer_->getOutputGrad())); + VLOG(lvl_) << "Random dom Backward Input, TopDiff: "; + printMatrix(refLayer_->getOutputGrad()); +} + +void MKLDNNTester::checkForward() { + printTopDatas(); + double delta = compareMatrix(testLayers_[DNN]->getOutputValue(), + testLayers_[REF]->getOutputValue()); + VLOG(MKLDNN_ALL) << "Check Forward"; + EXPECT_LE(fabs(delta), eps_); +} + +void MKLDNNTester::checkBackwardData() { + // TODO(TJ): uncomment me when batch norm ready + // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm"; + for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) { + const MatrixPtr& dnnDiff = 
dataLayers_[DNN][i]->getOutputGrad(); + const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad(); + VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i; + printMatrix(dnnDiff); + VLOG(lvl_) << "Reference Backward Output BotDiff " << i; + printMatrix(refDiff); + + double delta = compareMatrix(dnnDiff, refDiff); + EXPECT_LE(fabs(delta), eps_); + // TODO(TJ): uncomment me when batch norm ready + // if (isBN) { + // // the other two inputs in batch norm are for moving mean and var + // break; + // } + } +} + +void MKLDNNTester::checkBackwardWgts() { + CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size()); + vector dnnWgts; // used to temply save mkldnn weights + saveWgt(parameters_[DNN], dnnWgts); + + const MKLDNNLayerPtr dnnlayer = + std::dynamic_pointer_cast(dnnLayer_); + CHECK(dnnlayer); + dnnlayer->convertWeightsToPaddle(); + for (size_t i = 0; i < parameters_[DNN].size(); ++i) { + const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE); + const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE); + VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName(); + printVector(dnn); + VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName(); + printVector(ref); + + double delta = compareVector(dnn, ref); + EXPECT_LE(fabs(delta), eps_); + } + + VLOG(MKLDNN_ALL) << "Restore dnn weights before comapre"; + restoreWgt(dnnWgts, parameters_[DNN]); +} + +void MKLDNNTester::saveWgt(const vector& from, + vector& to) { + const bool useGpu = false; + to.resize(from.size()); + for (size_t i = 0; i < to.size(); ++i) { + const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE); + to[i] = Vector::create(wgt->getSize(), useGpu); + to[i]->copyFrom(*wgt); + } +} + +void MKLDNNTester::restoreWgt(const vector& from, + vector& to) { + CHECK_EQ(from.size(), to.size()); + for (size_t i = 0; i < from.size(); ++i) { + const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE); + wgt->copyFrom(*from[i]); + } +} + +// clear parameters grad +void MKLDNNTester::clearWgtDiffs() { + for (size_t n = 0; n < parameters_.size(); ++n) { + for (size_t i = 0; i < parameters_[n].size(); ++i) { + const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT); + if (grad) { + grad->zeroMem(); + } + } + } +} + +void MKLDNNTester::clearBotDiffs() { + // dnn and ref + for (size_t n = 0; n < dataLayers_.size(); ++n) { + // all inputs layers + for (size_t i = 0; i < dataLayers_[n].size(); ++i) { + dataLayers_[n][i]->getOutputGrad()->zeroMem(); + } + } +} + +void MKLDNNTester::clearBotDiffs(int n) { + CHECK_LT(n, NUM); + // all inputs layers + for (size_t i = 0; i < dataLayers_[n].size(); ++i) { + dataLayers_[n][i]->getOutputGrad()->zeroMem(); + } +} + +void MKLDNNTester::clearTopDatas() { + for (size_t i = 0; i < testLayers_.size(); ++i) { + testLayers_[i]->getOutputValue()->zeroMem(); + } +} + +void MKLDNNTester::printTopDatas() { + if (!log_) { + return; + } + + for (int n = 0; n < NUM; ++n) { + VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: "; + printMatrix(testLayers_[n]->getOutputValue()); + } +} + +void MKLDNNTester::printMatrix(const MatrixPtr& m) { + if (!log_) { + return; + } + + std::ostringstream ostr; + m->print(ostr); + VLOG(lvl_) << std::endl << ostr.str(); +} + +void MKLDNNTester::printVector(const VectorPtr& v) { + if (!log_) { + return; + } + + std::ostringstream ostr; + v->print(ostr, v->getSize()); + VLOG(lvl_) << std::endl << ostr.str(); +} + +double MKLDNNTester::getDelta(const real* d1, + const real* d2, + size_t len, + const float 
failRate, + const float thres) { + double delta = 0, sum = 0; + int failCnt = 0; + const double eps = 1e-5; + double maxOut = 0; + for (size_t i = 0; i < len; ++i) { + double ref = fabs(d2[i]); + double diff = fabs(d1[i] - d2[i]); + delta += diff; + sum += ref; + if (ref > eps && fabs(d1[i]) > eps && diff / ref > thres) { + maxOut = std::max(maxOut, diff / ref); + failCnt++; + } + } + EXPECT_TRUE(std::isnormal(sum)); + EXPECT_FALSE(std::isinf(sum)); + EXPECT_FALSE(std::isnan(delta)); + VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len + << ", delta: " << delta / sum << ", failCnt:" << failCnt; + return (failCnt / (float)len) > failRate ? maxOut : delta / sum; +} + +double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) { + CHECK_EQ(m1->getElementCnt(), m2->getElementCnt()); + return getDelta(m1->getData(), m2->getData(), m1->getElementCnt()); +} + +double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) { + CHECK_EQ(v1->getSize(), v2->getSize()); + return getDelta(v1->getData(), v2->getData(), v1->getSize()); +} + +void MKLDNNTester::runOnce() { + // test forward + randomBotDatas(); + dnnLayer_->forward(PASS_TRAIN); + refLayer_->forward(PASS_TRAIN); + checkForward(); + + // test backward + randomTopDiffs(); + dnnLayer_->backward(nullptr); + refLayer_->backward(nullptr); + checkBackwardData(); + checkBackwardWgts(); + + // clear buffers + // ref code will addto the diff, dnn code will writeto it + // and clearTopDatas() and clearWgtDiffs() should be coverd by test layers + clearBotDiffs(REF); +} + +void MKLDNNTester::run(const TestConfig& dnn, + const TestConfig& ref, + size_t batchSize, + size_t inputImgH, + size_t inputImgW, + size_t iter, + float epsilon, + bool log, + int level) { + VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " << dnn.layerConfig.type() + << " vs " << ref.layerConfig.type(); + ih_ = inputImgH; + iw_ = inputImgW; + iter_ = iter; + eps_ = epsilon; + log_ = log; + lvl_ = level; + + // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight + reset(dnn, ref, batchSize); + randomWgtDatas(); + clearWgtDiffs(); + clearBotDiffs(); + for (size_t i = 0; i < iter_; ++i) { + VLOG(MKLDNN_TESTS) << "Check Iteration " << i; + runOnce(); + } + + if (parameters_[DNN].empty()) { + // has no paramters + return; + } + + // After run some iterations, the mkldnn weight has been stored in dnnLayer + // and we can also get the mkldnn weight parameter header format. + // Weight parameter should always be index 0 (and bias index 1). 
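// Concretely: in the mkldnn_fc cases exercised by test_MKLDNN.cpp, the weight
// matrix sits at parameters_[DNN][0] and, when biasSize != 0, the bias at
// parameters_[DNN][1], so comparing getHeaderFormat() of index 0 on the dnn
// and reference sides is enough to decide whether a second pass is needed.
// If the formats differ, the code below re-creates the layers, marks the dnn
// weight with the MKLDNN header format, restores the saved values, and runs
// the same forward/backward checks again.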
+ // TODO(TJ): should also consider mean and var format when batchnorm ready + int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat(); + int refWgtFmt = parameters_[REF][0]->getHeaderFormat(); + if (dnnWgtFmt == refWgtFmt) { + // weight format are equal, so no need check more + return; + } + + // then save the weights and restart again + vector dnnWgts, refWgts; + CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size()); + saveWgt(parameters_[DNN], dnnWgts); + saveWgt(parameters_[REF], refWgts); + + // restart again with dnn weight format + reset(dnn, ref, batchSize); + // TODO(TJ): should also considerate mean and var format when batchnorm ready + parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt); + + // restore wgt + restoreWgt(dnnWgts, parameters_[DNN]); + restoreWgt(refWgts, parameters_[REF]); + clearWgtDiffs(); + clearBotDiffs(); + + for (size_t i = 0; i < iter_; ++i) { + VLOG(MKLDNN_TESTS) << "Check Iteration " << i; + runOnce(); + } +} + +} // namespace paddle diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h new file mode 100644 index 0000000000..e55e4493ff --- /dev/null +++ b/paddle/gserver/tests/MKLDNNTester.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "LayerGradUtil.h" +#include "paddle/gserver/layers/MKLDNNBase.h" + +namespace paddle { + +/** + * @brief test the functionality of Mkldnnlayers + * refer to paddle original function + */ +class MKLDNNTester { + enum { + DNN = 0, // MKLDNN layer + REF = 1, // Reference layer + NUM = 2, // Number of total + }; + +protected: + std::vector configs_; + vector layerNames_; + vector> dataLayers_; + vector> datas_; + vector layerMaps_; + vector> parameters_; + vector testLayers_; + LayerPtr dnnLayer_, refLayer_; + + /// run some iterations, all the result should pass + size_t iter_; + /// whether to print out the details + bool log_; + /// vlog level to print the matrix details datas + int lvl_; + /// epsilon + float eps_; + /// input image size, default 1 + size_t ih_, iw_; + +public: + explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) { + iter_ = iter; + eps_ = epsilon; + log_ = false; + lvl_ = MKLDNN_ALL; + } + + ~MKLDNNTester() {} + +public: + void run(const TestConfig& dnn, + const TestConfig& ref, + size_t batchSize, + size_t inputImgH = 1, + size_t inputImgW = 1, + size_t iter = 3, + float epsilon = 1e-4, + bool log = false, + int level = MKLDNN_ALL); + void setLogLevel(int lvl) { lvl_ = lvl; } + +private: + void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize); + void setInputImgSize(); + void runOnce(); + + void randomWgtDatas(); + void randomBotDatas(); + void randomTopDiffs(); + + void checkForward(); + void checkBackwardData(); + void checkBackwardWgts(); + + void clearWgtDiffs(); + void clearBotDiffs(); + void clearBotDiffs(int n); // clear specific layer + void clearTopDatas(); + + void printTopDatas(); + void printMatrix(const MatrixPtr& m); + void printVector(const VectorPtr& v); + + void saveWgt(const vector& from, vector& to); + void restoreWgt(const vector& from, vector& to); + + double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2); + double compareVector(const VectorPtr& v1, const VectorPtr& v2); + + /** + * Get delta percent + * if many(>failRate) wrong(abs(dnn-ref)/abs(ref)>thres) points return the + * max(diff/ref) + * else return sum(abs(a-b)) / sum(abs(b)) + * The return value should be smaller than eps when passing. + */ + double getDelta(const real* d1, + const real* d2, + size_t len, + const float failRate = 1e-3, + const float thres = 0.1); +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/concat_slice_a.conf b/paddle/gserver/tests/concat_slice_a.conf new file mode 100644 index 0000000000..dccf911089 --- /dev/null +++ b/paddle/gserver/tests/concat_slice_a.conf @@ -0,0 +1,41 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
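# Note: this config is one half of the TEST(Compare, concat_slice) case added
# to test_NetworkCompare.cpp further below. Here each conv output is cut into
# two slice_projection pieces ((0,4)+(4,12) and (1,5)+(5,15)) before the
# concat; concat_slice_b.conf takes the same ranges as single slices ((0,12)
# and (1,15)), and the two networks are expected to produce identical outputs.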
+ + +from paddle.trainer_config_helpers import * + +settings(batch_size=10) + +data = data_layer(name ="input", size=8*16*16) + +conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) +conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) + +proj1 = slice_projection(input=conv1, slices=[(0, 4), (4, 12)]) + +proj2 = slice_projection(input=conv2, slices=[(1, 5), (5, 15)]) + +concat = concat_layer(input=[proj1, proj2]) + +outputs(concat) + diff --git a/paddle/gserver/tests/concat_slice_b.conf b/paddle/gserver/tests/concat_slice_b.conf new file mode 100644 index 0000000000..29686ef281 --- /dev/null +++ b/paddle/gserver/tests/concat_slice_b.conf @@ -0,0 +1,41 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from paddle.trainer_config_helpers import * + +settings(batch_size=10) + +data = data_layer(name ="input", size=8*16*16) + +conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) +conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) + +proj1 = slice_projection(input=conv1, slices=[(0, 12)]) + +proj2 = slice_projection(input=conv2, slices=[(1, 15)]) + +concat = concat_layer(input=[proj1, proj2]) + +outputs(concat) + diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp index b201ba8a5a..de93972a58 100644 --- a/paddle/gserver/tests/test_ActivationGrad.cpp +++ b/paddle/gserver/tests/test_ActivationGrad.cpp @@ -57,6 +57,39 @@ TEST(Activation, activation) { } } +void testSequenceSoftmaxAct(bool hasSubseq) { + LOG(INFO) << "test activation: sequence softmax"; + + const size_t size = 1; + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("addto"); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sequence_softmax"); + config.inputDefs.push_back( + {hasSubseq ? 
INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, + "layer_0", + 1, + 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "sequence_softmax", + 100, + /* trans= */ false, + useGpu, + /* useWeight */ true); + } +} + +TEST(SequenceSoftmaxActivation, activation) { + for (auto hasSubseq : {false, true}) { + LOG(INFO) << "hasSubseq = " << hasSubseq; + testSequenceSoftmaxAct(hasSubseq); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp index 83fcfed46c..659eefa31b 100644 --- a/paddle/gserver/tests/test_BatchNorm.cpp +++ b/paddle/gserver/tests/test_BatchNorm.cpp @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/utils/GlobalConstants.h" #include "LayerGradUtil.h" +#include "paddle/cuda/include/hl_batch_norm.h" +#include "paddle/math/tests/TensorCheck.h" #include "paddle/testing/TestUtil.h" using namespace paddle; // NOLINT @@ -117,6 +119,74 @@ TEST(Layer, batchNorm) { CHECK_EQ(static_cast(convLayer->getOutputValue()->getWidth()), 576); } +#ifndef PADDLE_ONLY_CPU +void batchNormInference(int n, int c, int h, int w) { + MatrixPtr input = std::make_shared(n, c * h * w); + MatrixPtr cudnnOut = std::make_shared(n, c * h * w); + MatrixPtr cudaOut = std::make_shared(n, c * h * w); + MatrixPtr cudnnCheck = std::make_shared(n, c * h * w); + MatrixPtr cudaCheck = std::make_shared(n, c * h * w); + input->randomizeUniform(); + cudnnOut->zeroMem(); + cudaOut->zeroMem(); + + MatrixPtr scale = std::make_shared(1, c); + scale->randomizeUniform(); + MatrixPtr bias = std::make_shared(1, c); + bias->randomizeUniform(); + + MatrixPtr movingMean = std::make_shared(1, c); + movingMean->randomizeUniform(); + + MatrixPtr movingVar = std::make_shared(1, c); + movingVar->randomizeUniform(); + movingVar->clip(0.01, 50); + + hl_tensor_descriptor ioDesc; + hl_tensor_descriptor bnDesc; + hl_create_tensor_descriptor(&ioDesc); + hl_create_tensor_descriptor(&bnDesc); + hl_tensor_reshape(ioDesc, n, c, h, w); + hl_tensor_reshape(bnDesc, 1, c, 1, 1); + + double EPS = 1E-5; + hl_batch_norm_forward_inference(ioDesc, + input->getData(), + ioDesc, + cudnnOut->getData(), + bnDesc, + scale->getData(), + bias->getData(), + movingMean->getData(), + movingVar->getData(), + EPS); + + hl_batch_norm_cuda_inference(input->getData(), + cudaOut->getData(), + scale->getData(), + bias->getData(), + movingMean->getData(), + movingVar->getData(), + EPS, + n, + c, + h, + w); + + cudnnCheck->copyFrom(*cudnnOut); + cudaCheck->copyFrom(*cudaOut); + autotest::TensorCheckErr(*cudnnCheck, *cudaCheck); + + hl_destroy_tensor_descriptor(ioDesc); + hl_destroy_tensor_descriptor(bnDesc); +} + +TEST(BatchNorm, Inference) { + batchNormInference(33, 267, 1, 1); + batchNormInference(19, 105, 4, 4); +} +#endif + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp new file mode 100644 index 0000000000..308abe6816 --- /dev/null +++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/gserver/layers/DataLayer.h" +#include "paddle/trainer/Trainer.h" +#include "paddle/utils/GlobalConstants.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_bool(thread_local_rand_use_global_seed); + +vector randSampling(int range, int n) { + CHECK_GE(range, n); + vector num(range); + iota(begin(num), end(num), 0); + if (range == n) return num; + + random_shuffle(begin(num), end(num)); + num.resize(n); + return num; +} + +void genRandomSeqInfo(vector& seqStartPosition, + vector& subSeqStartPosition) { + const int maxSeqNum = 100; + // generate random start position information + int seqNum = 1 + (rand() % maxSeqNum); + seqStartPosition.resize(seqNum + 1, 0); + subSeqStartPosition.resize(1, 0); + + for (int i = 0; i < seqNum; ++i) { + int subSeqLen = 1 + (rand() % maxSeqNum); + for (int j = 0; j < subSeqLen; ++j) + subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen); + seqStartPosition[i + 1] = subSeqStartPosition.back(); + } +} + +void genRandomGroundTruth(real* values, + vector>& groundTruth, + vector& startPos, + size_t beamSize) { + groundTruth.resize(startPos.size() - 1, vector(beamSize, -1)); + for (size_t i = 0; i < startPos.size() - 1; ++i) { + int seqLen = startPos[i + 1] - startPos[i]; + vector pos = + randSampling(seqLen, min(static_cast(beamSize), seqLen)); + for (size_t j = 0; j < pos.size(); ++j) { + groundTruth[i][j] = pos[j]; + values[startPos[i] + pos[j]] = 1.; + } + } +} + +void checkLayerOut(vector> groundTruth, + real* layerOut, + size_t beamSize) { + for (size_t i = 0; i < groundTruth.size(); ++i) { + int begPos = i * beamSize; + vector tmp(layerOut + begPos, layerOut + begPos + beamSize); + sort(begin(tmp), end(tmp)); + sort(begin(groundTruth[i]), end(groundTruth[i])); + for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]); + } +} + +TEST(Layer, kmaxSeqScoreLayer) { + const size_t maxBeamSize = 100; + size_t beamSize = 1 + (rand() % maxBeamSize); + + vector seqStartPosition; + vector subSeqStartPosition; + genRandomSeqInfo(seqStartPosition, subSeqStartPosition); + MatrixPtr inValue = + Matrix::create(subSeqStartPosition.back(), 1, false, false); + + std::vector mode = {false}; +#ifndef PADDLE_ONLY_CPU + mode.push_back(true); +#endif + + for (auto hasSubseq : {false, true}) { + vector> groundTruth; + inValue->randomizeUniform(); + genRandomGroundTruth(inValue->getData(), + groundTruth, + hasSubseq ? 
subSeqStartPosition : seqStartPosition, + beamSize); + + for (auto useGpu : mode) { + TestConfig config; + config.layerConfig.set_type("kmax_seq_score"); + config.layerConfig.set_beam_size(beamSize); + + if (hasSubseq) { + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + "scores", + inValue, + seqStartPosition, + subSeqStartPosition}); + } else { + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition}); + } + config.layerConfig.add_inputs(); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + config, + &dataLayers, + &datas, + &layerMap, + "kmax_seq_score", + 100 /* actually this parameter is unused in self-defined input*/, + false, + useGpu); + // test layer initialize + std::vector parameters; + LayerPtr kmaxSeqScoreLayer; + FLAGS_use_gpu = useGpu; + initTestLayer(config, &layerMap, ¶meters, &kmaxSeqScoreLayer); + kmaxSeqScoreLayer->forward(PASS_TRAIN); + + const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue(); + CHECK_EQ(outValue->getHeight(), + hasSubseq ? subSeqStartPosition.size() - 1 + : seqStartPosition.size() - 1); + CHECK_EQ(outValue->getWidth(), beamSize); + checkLayerOut(groundTruth, outValue->getData(), beamSize); + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand((size_t)(time(NULL))); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 67251f08e3..dd2c955e6a 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -152,6 +152,26 @@ TEST(Projection, identity) { } } +TEST(Projection, slice) { + ProjectionConfig conf; + conf.set_type("slice"); + conf.set_input_size(100); + SliceConfig& slice1 = *conf.add_slices(); + slice1.set_start(10); + slice1.set_end(20); + SliceConfig& slice2 = *conf.add_slices(); + slice2.set_start(50); + slice2.set_end(70); + conf.set_output_size(30); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 0, + /* batchSize */ 10, + useGpu); + } +} + TEST(Projection, scaling) { ProjectionConfig conf; conf.set_type("scaling"); @@ -347,6 +367,55 @@ TEST(Layer, CosSimVecMatLayer) { } } +void testDepthwiseConvLayer(const string& type, bool useGpu) { + TestConfig config; + config.biasSize = 32; + config.layerConfig.set_type(type); + config.layerConfig.set_num_filters(32); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(3); + conv->set_channels(16); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(16); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(16); + conv->set_img_size_y(8); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + conv->set_output_y(outputSize(conv->img_size_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + config.layerConfig.num_filters()); + + testLayerGrad(config, "depthwise_conv", 
100, false, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02); +} + +TEST(Layer, depthwiseConvLayer) { + // 'depthwise_conv' is a sepecial case of 'exconv' whose + // groups size equals to the input channels size. + testDepthwiseConvLayer("exconv", /* useGpu= */ false); +#ifndef PADDLE_ONLY_CPU + testDepthwiseConvLayer("exconv", /* useGpu= */ true); +#endif +} + void testConvLayer(const string& type, bool trans, bool useGpu) { TestConfig config; config.biasSize = 16; @@ -1802,6 +1871,157 @@ TEST(Layer, RowConvLayer) { } } +TEST(Layer, CropLayer) { + TestConfig config; + // config input_0 + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ImageConfig* img = input->mutable_image_conf(); + img->set_channels(4); + img->set_img_size(16); + config.layerConfig.set_axis(2); + config.layerConfig.add_offset(0); + config.layerConfig.add_offset(0); + + // config input_1 + config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0}); + input = config.layerConfig.add_inputs(); + img = input->mutable_image_conf(); + img->set_channels(2); + img->set_img_size(8); + + // config crop layer + config.layerConfig.set_type("crop"); + config.layerConfig.set_name("cropLayer"); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "crop", 100, false, useGpu, false); + } +} + +vector randSampling(real range, int n) { + CHECK_GE(range, n); + vector num(range); + iota(begin(num), end(num), 0.); + if (range == n) return num; + + random_shuffle(begin(num), end(num)); + num.resize(n); + sort(begin(num), end(num)); + return num; +} + +TEST(Layer, SubNestedSequenceLayer) { + // layer size is not crutial for this layer, + // so use a small layer size in unittest + const int layerSize = 4; + + const int maxSeqNum = 50; + const int maxSeqLen = 50; + const int maxBeamSize = 32; + + srand((size_t)(time(NULL))); + int beamSize = 1 + (rand() % maxBeamSize); + + TestConfig config; + config.layerConfig.set_type("sub_nested_seq"); + config.layerConfig.set_name("sub_nested_seq_layer"); + config.layerConfig.set_size(layerSize); + + int seqNum = 1 + (rand() % maxSeqNum); + + // sequence information for the first input, it is a nested sequence + vector seqStartPos(seqNum + 1, 0); + vector subSeqStartPos(1, 0); + + // selected indices + MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false); + selectedIndices->one(); + selectedIndices->mulScalar(-1.); + real* indicesData = selectedIndices->getData(); + + for (int i = 0; i < seqNum; ++i) { + int subSeqNum = 1 + (rand() % maxSeqNum); + for (int j = 0; j < subSeqNum; ++j) { + subSeqStartPos.push_back(subSeqStartPos.back() + + (1 + (rand() % maxSeqLen))); + } + vector selSeqs = + randSampling(static_cast(subSeqNum), min(beamSize, subSeqNum)); + memcpy(indicesData + (i * beamSize), + selSeqs.data(), + selSeqs.size() * sizeof(real)); + seqStartPos[i + 1] = subSeqStartPos.back(); + } + + MatrixPtr seqInputPtr = + Matrix::create(seqStartPos.back(), layerSize, false, false); + seqInputPtr->randomizeUniform(); + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + "nested_seq_input", + seqInputPtr, + seqStartPos, + subSeqStartPos}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "sub_nested_seq", + /* batchSize */ 
seqNum, + /* trans */ false, + /* useGpu*/ useGpu, + /* useWeight */ false); + } +} + +TEST(Layer, ClipLayer) { + const size_t batchSize = 128; + const size_t size = 512; + TestConfig config; + config.layerConfig.set_type("clip"); + config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ClipConfig* layerConf = input->mutable_clip_conf(); + double p1 = std::rand() / (double)RAND_MAX; + double p2 = std::rand() / (double)RAND_MAX; + layerConf->set_min(std::min(p1, p2)); + layerConf->set_max(std::max(p1, p2)); + for (auto useGpu : {false, true}) { + testLayerGrad(config, "clip", batchSize, false, useGpu, false); + } +} + +TEST(Layer, RowL2NormLayer) { + const size_t batchSize = 128; + const size_t size = 512; + TestConfig config; + config.layerConfig.set_type("row_l2_norm"); + config.layerConfig.set_size(size); + config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); + config.layerConfig.add_inputs(); + for (auto useGpu : {false, true}) { + testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false); + } +} + +TEST(Layer, ScaleShiftLayer) { + const size_t batchSize = 16; + const size_t size = 32; + TestConfig config; + config.layerConfig.set_type("scale_shift"); + config.layerConfig.set_size(size); + config.biasSize = 1; + config.inputDefs.push_back( + {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1}); + config.layerConfig.add_inputs(); + for (auto useGpu : {false, true}) { + testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp new file mode 100644 index 0000000000..e1d2270df2 --- /dev/null +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -0,0 +1,76 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "MKLDNNTester.h" +#include "ModelConfig.pb.h" + +using namespace paddle; // NOLINT + +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(use_gpu); +DECLARE_bool(use_mkldnn); + +struct testFCDesc { + int bs; + int ic; + int oc; + int ih, iw; // oh == ow == 1 +}; + +void testFcLayer(const testFCDesc& pm) { + const std::string compareTypes[] = {"mkldnn_fc", "fc"}; + TestConfig cfg; + cfg.layerConfig.set_type(compareTypes[0]); + cfg.layerConfig.set_size(pm.oc); + cfg.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), + /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)}); + cfg.layerConfig.add_inputs(); + + MKLDNNTester tester; + for (auto biasSize : {pm.oc, 0}) { + cfg.biasSize = biasSize; + TestConfig ref = cfg; + ref.layerConfig.set_type(compareTypes[1]); + for (auto bs : {pm.bs, 1}) { + tester.run(cfg, ref, bs, pm.ih, pm.iw); + } + } +} + +TEST(MKLDNNLayer, FcLayer) { + testFcLayer({/*bs*/ 2, /*ic*/ 2, /*oc*/ 3, /*ih*/ 1, /*iw*/ 1}); + testFcLayer({/*bs*/ 3, /*ic*/ 7, /*oc*/ 19, /*ih*/ 1, /*iw*/ 1}); + testFcLayer({/*bs*/ 8, /*ic*/ 16, /*oc*/ 32, /*ih*/ 13, /*iw*/ 13}); + testFcLayer({/*bs*/ 4, /*ic*/ 12, /*oc*/ 18, /*ih*/ 13, /*iw*/ 11}); + testFcLayer({/*bs*/ 2, /*ic*/ 64, /*oc*/ 32, /*ih*/ 16, /*iw*/ 16}); + testFcLayer({/*bs*/ 15, /*ic*/ 3, /*oc*/ 6, /*ih*/ 16, /*iw*/ 16}); +} + +// TODO(TJ): add branch test + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + FLAGS_use_gpu = false; + FLAGS_use_mkldnn = true; + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp index 40e662b22b..d36f72360f 100644 --- a/paddle/gserver/tests/test_NetworkCompare.cpp +++ b/paddle/gserver/tests/test_NetworkCompare.cpp @@ -237,6 +237,12 @@ TEST(Compare, concat_table) { compareNetwork(config_file_a, config_file_b); } +TEST(Compare, concat_slice) { + std::string config_file_a = "./gserver/tests/concat_slice_a.conf"; + std::string config_file_b = "./gserver/tests/concat_slice_b.conf"; + compareNetwork(config_file_a, config_file_b); +} + #ifndef PADDLE_ONLY_CPU TEST(Compare, img_pool) { std::string config_file_a = "./gserver/tests/img_pool_a.conf"; @@ -263,7 +269,8 @@ TEST(Compare, img_conv2) { bool useGpu = FLAGS_use_gpu; double eps = FLAGS_checkgrad_eps; FLAGS_use_gpu = true; - FLAGS_checkgrad_eps = 1e-2; + // Sometimes, this unit test will fail with 1e-2 + FLAGS_checkgrad_eps = 4e-2; compareNetwork(config_file_a, config_file_b); FLAGS_use_gpu = useGpu; FLAGS_checkgrad_eps = eps; diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index de48b6fac9..5435808fb7 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include #include +#include +#include #include "BaseMatrix.h" -#include "hl_matrix_ops.cuh" -#include "hl_matrix_base.cuh" -#include "hl_matrix_apply.cuh" -#include "SIMDFunctions.h" #include "MathFunctions.h" +#include "SIMDFunctions.h" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_base.cuh" +#include "hl_matrix_ops.cuh" namespace paddle { const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported."; -template +template template int BaseMatrixT::applyUnary(Op op) { MatrixOffset offset(0, 0); @@ -34,9 +34,11 @@ int BaseMatrixT::applyUnary(Op op) { return 0; } -template +template template -int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, +int BaseMatrixT::applyUnary(Op op, + int numRows, + int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; int dimM = numRows; @@ -56,7 +58,7 @@ int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, return 0; } -template +template template int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { CHECK(height_ == b.height_ && width_ == b.width_) @@ -67,18 +69,23 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { return 0; } -template +template template -int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset) { +int BaseMatrixT::applyBinary( + Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) { applyBinary(op, b, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset, bAsRowVector, bAsColVector) { +int BaseMatrixT::applyBinary(Op op, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + bAsRowVector, + bAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch"; @@ -91,8 +98,8 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, T* A = data_; T* B = b.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); if (!bAsRowVector::value && !bAsColVector::value) { @@ -115,7 +122,7 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, return 0; } -template +template template int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { CHECK_EQ(height_, b.height_); @@ -129,21 +136,29 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, MatrixOffset& offset) { applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, MatrixOffset& offset, - cAsRowVector, cAsColVector) { +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + cAsRowVector, + cAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; 
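// Note on the cAsRowVector / cAsColVector tag types used below (and the
// matching bAsRowVector / bAsColVector tags in applyBinary above): when the
// row-vector tag is true_type, the operand is treated as a 1 x numCols vector
// broadcast across every row, so only its width is range-checked; when the
// column-vector tag is true_type, it is a numRows x 1 vector broadcast across
// every column, so only its height is checked. addRowVector()/addColVector()
// further down are thin wrappers that pass these tags together with
// binary::Add.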
CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -160,10 +175,10 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, T* B = b.data_; T* C = c.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -180,21 +195,21 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, } if (true == useGpu_) { - hl_gpu_apply_ternary_op - ( + hl_gpu_apply_ternary_op( op, A, B, C, dimM, dimN, lda, ldb, ldc); } else { - hl_cpu_apply_ternary_op - ( + hl_cpu_apply_ternary_op( op, A, B, C, dimM, dimN, lda, ldb, ldc); } return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, BaseMatrixT& d) { CHECK_EQ(height_, b.height_); CHECK_EQ(width_, b.width_); @@ -209,10 +224,14 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, - BaseMatrixT& d, int numRows, int numCols, +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d, + int numRows, + int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -234,12 +253,12 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, T* C = c.data_; T* D = d.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); - CAL_MATRIX_START_ADDRESS(D, d.height_, d.width_, ldd, offset.dCol_, - offset.dRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); + CAL_MATRIX_START_ADDRESS( + D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -250,22 +269,29 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, CHECK_LE(dimM + offset.dRow_, d.height_); CHECK_LE(dimN + offset.dCol_, d.width_); if (true == useGpu_) { - hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, - ldc, ldd); + hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); } else { - hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, - ldc, ldd); + hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); } return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, - int numRows, int numCols, MatrixOffset& offset, - aAsRowVector, aAsColVector) { +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, + aAsColVector) { CHECK_EQ(useGpu_, 
b.useGpu_); int ld = stride_; @@ -273,10 +299,10 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, T* dst = data_; T* B = b.data_; - CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, - offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { @@ -297,12 +323,21 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, - BaseMatrixT& c, int numRows, int numCols, - MatrixOffset& offset, aAsRowVector, +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, aAsColVector) { CHECK_EQ(useGpu_, b.useGpu_); CHECK_EQ(useGpu_, c.useGpu_); @@ -314,28 +349,28 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, T* dst = data_; T* B = b.data_; T* C = c.data_; - CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, - offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, - ldb, C, ldc); + hl_gpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); } else { - hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, - ldb, C, ldc); + hl_cpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); } } else if (!aAsRowVector::value && aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, - ldb, C, ldc); + hl_gpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); } else { - hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, - ldb, C, ldc); + hl_cpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); } } else { LOG(FATAL) << "not supported"; @@ -350,15 +385,19 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, */ DEFINE_MATRIX_UNARY_OP(Neg, a = -a); -template -void BaseMatrixT::neg() { applyUnary(unary::Neg()); } +template +void BaseMatrixT::neg() { + applyUnary(unary::Neg()); +} DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); -template<> -void BaseMatrixT::exp2() { applyUnary(unary::Exp()); } +template <> +void BaseMatrixT::exp2() { + applyUnary(unary::Exp()); +} DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); -template<> +template <> void BaseMatrixT::log2() { if (useGpu_) { applyUnary(unary::Log()); @@ -368,30 +407,42 @@ void BaseMatrixT::log2() { } DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); -template<> -void BaseMatrixT::sqrt2() { applyUnary(unary::Sqrt()); } +template <> +void BaseMatrixT::sqrt2() { + applyUnary(unary::Sqrt()); +} DEFINE_MATRIX_UNARY_OP(Square, a = a * a); -template -void BaseMatrixT::square2() { applyUnary(unary::Square()); } +template +void 
BaseMatrixT::square2() { + applyUnary(unary::Square()); +} DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); -template -void BaseMatrixT::reciprocal2() { applyUnary(unary::Reciprocal()); } +template +void BaseMatrixT::reciprocal2() { + applyUnary(unary::Reciprocal()); +} DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a); -template -void BaseMatrixT::abs2() { applyUnary(unary::Abs()); } +template +void BaseMatrixT::abs2() { + applyUnary(unary::Abs()); +} DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); -template -void BaseMatrixT::sign2() { applyUnary(unary::Sign()); } +template +void BaseMatrixT::sign2() { + applyUnary(unary::Sign()); +} DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -template -void BaseMatrixT::zero() { applyUnary(unary::Zero()); } +template +void BaseMatrixT::zero() { + applyUnary(unary::Zero()); +} -template +template void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { int numRows = height_; int numCols = numColumns; @@ -400,11 +451,13 @@ void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { } DEFINE_MATRIX_UNARY_OP(One, a = 1); -template -void BaseMatrixT::one() { applyUnary(unary::One()); } +template +void BaseMatrixT::one() { + applyUnary(unary::One()); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); -template<> +template <> void BaseMatrixT::pow2(real p) { if (useGpu_) { applyUnary(unary::Pow(p)); @@ -414,44 +467,67 @@ void BaseMatrixT::pow2(real p) { } DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p); -template -void BaseMatrixT::subScalar(T p) { applyUnary(unary::SubScalar(p)); } +template +void BaseMatrixT::subScalar(T p) { + applyUnary(unary::SubScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p); -template -void BaseMatrixT::mulScalar(T p) { applyUnary(unary::MulScalar(p)); } +template +void BaseMatrixT::mulScalar(T p) { + applyUnary(unary::MulScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p); -template -void BaseMatrixT::divScalar(T p) { applyUnary(unary::DivScalar(p)); } +template +void BaseMatrixT::divScalar(T p) { + applyUnary(unary::DivScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p); -template -void BaseMatrixT::assign(T p) { applyUnary(unary::Assign(p)); } +template +void BaseMatrixT::assign(T p) { + applyUnary(unary::Assign(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p); -template -void BaseMatrixT::add(T p) { applyUnary(unary::Add(p)); } +template +void BaseMatrixT::add(T p) { + applyUnary(unary::Add(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2); -template -void BaseMatrixT::add(T p1, T p2) { applyUnary(unary::Add2(p1, p2)); } +template +void BaseMatrixT::add(T p1, T p2) { + applyUnary(unary::Add2(p1, p2)); +} -DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER, +DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, + TWO_PARAMETER, a = a < p1 ? p1 : (a > p2 ? p2 : a)); -template -void BaseMatrixT::clip(T p1, T p2) { applyUnary(unary::Clip(p1, p2)); } +template +void BaseMatrixT::clip(T p1, T p2) { + applyUnary(unary::Clip(p1, p2)); +} -DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, + TWO_PARAMETER, + a = b < p1 ? 0 : (b > p2 ? 0 : 1)); +template +void BaseMatrixT::clipDerivative(BaseMatrixT& b, T p1, T p2) { + applyBinary(binary::ClipDerivative(p1, p2), b); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, + ONE_PARAMETER, a = a > p ? 
1.0f : 0.0f); -template +template void BaseMatrixT::biggerThanScalar(T p) { applyUnary(unary::BiggerThanScalar(p)); } -DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, - a = a > p ? a : p); -template +DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p); +template void BaseMatrixT::downClip(T p) { applyUnary(unary::DownClip(p)); } @@ -462,12 +538,12 @@ void BaseMatrixT::downClip(T p) { */ DEFINE_MATRIX_BINARY_OP(Add, a += b); -template +template void BaseMatrixT::add(BaseMatrixT& b) { applyBinary(binary::Add(), b); } -template<> +template <> void BaseMatrixT::add(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Add(), b); @@ -478,7 +554,7 @@ void BaseMatrixT::add(BaseMatrixT& b) { } } -template +template void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -497,43 +573,53 @@ void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { } } -template +template void BaseMatrixT::addP2P(BaseMatrixT& b) { T* A = data_; T* B = b.data_; int dimM = height_; int dimN = width_; - hl_gpu_apply_binary_op, 0, 0> - (binary::Add(), A, B, dimM, dimN, dimN, dimN); + hl_gpu_apply_binary_op, 0, 0>( + binary::Add(), A, B, dimM, dimN, dimN, dimN); } -template +template void BaseMatrixT::addColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), b, numRows, numCols, offset, false_type(), + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + false_type(), true_type() /* bAsColVector */); } -template +template void BaseMatrixT::addRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p) { applyBinary(binary::Add1(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); -template<> +template <> void BaseMatrixT::pow2(BaseMatrixT& b, real p) { if (useGpu_) { applyBinary(binary::Pow(p), b); @@ -543,36 +629,45 @@ void BaseMatrixT::pow2(BaseMatrixT& b, real p) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Add2(p1, p2), b); } -template +template void BaseMatrixT::addBias(BaseMatrixT& b, T scale) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add1(scale), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::Add1(scale), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_OP(Sub, a -= b); -template -void BaseMatrixT::sub(BaseMatrixT& b) { applyBinary(binary::Sub(), b); } +template +void BaseMatrixT::sub(BaseMatrixT& b) { + applyBinary(binary::Sub(), b); +} DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p) { applyBinary(binary::Sub1(p), b); } DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? 
a : 0.0f); -template -void BaseMatrixT::relu(BaseMatrixT& b) { applyBinary(binary::Relu(), b); } +template +void BaseMatrixT::relu(BaseMatrixT& b) { + applyBinary(binary::Relu(), b); +} DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); -template +template void BaseMatrixT::reluDerivative(BaseMatrixT& b) { applyBinary(binary::ReluDerivative(), b); } @@ -582,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))); -template<> +template <> void BaseMatrixT::softrelu(BaseMatrixT& b) { applyBinary(binary::Softrelu(), b); } @@ -592,97 +687,100 @@ DEFINE_MATRIX_BINARY_OP( a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) ? THRESHOLD : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); -template<> +template <> void BaseMatrixT::softreluDerivative(BaseMatrixT& b) { applyBinary(binary::SoftreluDerivative(), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1; b = b < p2 ? b : p2); -template +template void BaseMatrixT::brelu(BaseMatrixT& b) { - int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. + int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. applyBinary(binary::Brelu(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, + TWO_PARAMETER, a *= (b > p1 && b < p2) ? 1.0 : 0.0); -template +template void BaseMatrixT::breluDerivative(BaseMatrixT& b) { int p1 = 0, p2 = 24; applyBinary(binary::BreluDerivative(p1, p2), b); } DEFINE_MATRIX_BINARY_OP(Square, b = a * a); -template +template void BaseMatrixT::square2(BaseMatrixT& b) { applyBinary(binary::Square(), b); } DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b); -template +template void BaseMatrixT::squareDerivative(BaseMatrixT& b) { applyBinary(binary::SquareDerivative(), b); } -DEFINE_MATRIX_BINARY_OP(Tanh, - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); -template<> +DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template <> void BaseMatrixT::tanh(BaseMatrixT& b) { applyBinary(binary::Tanh(), b); } DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b); -template +template void BaseMatrixT::tanhDerivative(BaseMatrixT& b) { applyBinary(binary::TanhDerivative(), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanh, TWO_PARAMETER, - b = p1 * - (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); -template<> +DEFINE_MATRIX_BINARY_PARAMETER_OP( + ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); +template <> void BaseMatrixT::scaledTanh(BaseMatrixT& b, real p1, real p2) { applyBinary(binary::ScaledTanh(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, + TWO_PARAMETER, a *= p2 * (p1 - b * b)); -template +template void BaseMatrixT::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ScaledTanhDerivative(p1 * p1, p2 / p1), b); } DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b) { applyBinary(binary::Reciprocal(), b); } DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b); -template +template void BaseMatrixT::reciprocalDerivative(BaseMatrixT& b) { applyBinary(binary::ReciprocalDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? 
a : -a); -template -void BaseMatrixT::abs2(BaseMatrixT& b) { applyBinary(binary::Abs(), b); } +template +void BaseMatrixT::abs2(BaseMatrixT& b) { + applyBinary(binary::Abs(), b); +} DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); -template +template void BaseMatrixT::absDerivative(BaseMatrixT& b) { applyBinary(binary::AbsDerivative(), b); } -DEFINE_MATRIX_BINARY_OP( - Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0; - T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN - : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); - b = 1.0f / (1.0f + exp(-tmp))); -template<> +DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0; + const T THRESHOLD_MAX = 13.0; + T tmp = (a < THRESHOLD_MIN) + ? THRESHOLD_MIN + : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); + b = 1.0f / (1.0f + exp(-tmp))); +template <> void BaseMatrixT::sigmoid(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Sigmoid(), b); @@ -716,31 +814,31 @@ void BaseMatrixT::sigmoid(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b)); -template +template void BaseMatrixT::sigmoidDerivative(BaseMatrixT& b) { applyBinary(binary::SigmoidDerivative(), b); } DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b); -template +template void BaseMatrixT::expDerivative(BaseMatrixT& b) { applyBinary(binary::ExpDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f); -template +template void BaseMatrixT::sign2(BaseMatrixT& b) { applyBinary(binary::Sign(), b); } DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); -template<> +template <> void BaseMatrixT::exp2(BaseMatrixT& b) { applyBinary(binary::Exp(), b); } DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); -template<> +template <> void BaseMatrixT::log2(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Log(), b); @@ -750,13 +848,13 @@ void BaseMatrixT::log2(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); -template<> +template <> void BaseMatrixT::sqrt2(BaseMatrixT& b) { applyBinary(binary::Sqrt(), b); } DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b)); -template<> +template <> void BaseMatrixT::invSqrt(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::InvSqrt(), b); @@ -768,37 +866,37 @@ void BaseMatrixT::invSqrt(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p)); -template +template void BaseMatrixT::isEqualTo(BaseMatrixT& b, T value) { applyBinary(binary::IsEqual(value), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p); -template +template void BaseMatrixT::addScalar(BaseMatrixT& b, T p) { applyBinary(binary::AddScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p); -template +template void BaseMatrixT::subScalar(BaseMatrixT& b, T p) { applyBinary(binary::SubScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p); -template +template void BaseMatrixT::mulScalar(BaseMatrixT& b, T p) { applyBinary(binary::MulScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p); -template +template void BaseMatrixT::divScalar(BaseMatrixT& b, T p) { applyBinary(binary::DivScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b); -template +template void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { applyBinary(binary::ScalarDiv(p), b); } @@ -810,20 +908,20 @@ void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy, a = -c * log(b) - (1 - c) * log(1 - b)); -template<> +template <> void 
BaseMatrixT::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropy(), b, c); } DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b))); -template +template void BaseMatrixT::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy, a = c > 0.5 ? -log(b) : -log(1.0 - b)); -template<> +template <> void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { @@ -851,70 +949,73 @@ void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp, a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)); -template +template void BaseMatrixT::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BinaryCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(Add, a = b + c); -template +template void BaseMatrixT::add(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Add1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Sub(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Sub1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add2(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, a = p1 * a + p2 * b + p3 * c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::Add3(p1, p2, p3), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, c = p2 * c - p1 * (b + p3 * a); a = a + c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad BaseMatrixT& c, // mom - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyTernary(ternary::SgdUpdate(p1, p2, p3), b, c); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, c = p2 * c - p1 * d * (b + p3 * a); a += c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad, BaseMatrixT& c, // mom, BaseMatrixT& d, // lr, - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyQuaternary(quaternary::SgdUpdate(p1, p2, p3), b, c, d); } @@ -922,19 +1023,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? 
(a + lambda) : 0); -template +template void BaseMatrixT::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } -template<> +template <> void BaseMatrixT::applyL1(BaseMatrixT& lr, real learningRate, real decayRate) { if (useGpu_) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } else { - simd::decayL1(this->data_, this->data_, lr.data_, learningRate * decayRate, + simd::decayL1(this->data_, + this->data_, + lr.data_, + learningRate * decayRate, height_ * width_); } } @@ -943,24 +1047,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? (a + lambda) : 0); -template +template void BaseMatrixT::applyL1(T learningRate, T decayRate) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } -template<> +template <> void BaseMatrixT::applyL1(real learningRate, real decayRate) { if (useGpu_) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } else { - simd::decayL1(this->data_, this->data_, learningRate * decayRate, - height_ * width_); + simd::decayL1( + this->data_, this->data_, learningRate * decayRate, height_ * width_); } } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, + ONE_PARAMETER, a *= (1.0f / (1.0f + p * b))); -template +template void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { if (useGpu_) { applyBinary(binary::ApplyL2(learningRate * decayRate), lr); @@ -973,32 +1078,33 @@ void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { } } -template +template void BaseMatrixT::applyL2(T learningRate, T decayRate) { BaseMatrixT::mulScalar(1.0f / (1.0f + learningRate * decayRate)); } DEFINE_MATRIX_BINARY_OP(DotMul, a *= b); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b) { applyBinary(binary::DotMul(), b); } DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMul(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotDiv(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, + TWO_PARAMETER, a = (b + p1) / (c + p2)); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotDiv2P(p1, p2), b, c); } @@ -1008,7 +1114,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); a = log(1 + exp(a)) - a * d); -template<> +template <> void BaseMatrixT::rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1019,8 +1125,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c; a = (a > THRESHOLD) ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); - a = exp(a); a = (a / (1 + a) - d)); -template<> + a = exp(a); + a = (a / (1 + a) - d)); +template <> void BaseMatrixT::rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1033,7 +1140,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0; ? 
-THRESHOLD : b; a = log(1 + exp(x)) - c * x); -template<> +template <> void BaseMatrixT::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLoss(), b, c); } @@ -1043,22 +1150,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0; T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) ? -THRESHOLD : b; - x = exp(x); a = x / (1 + x) - c); -template<> + x = exp(x); + a = x / (1 + x) - c); +template <> void BaseMatrixT::logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLossBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BiggerThan(), b, c); } DEFINE_MATRIX_QUATERNARY_OP( BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1066,25 +1174,34 @@ void BaseMatrixT::biggerThan(BaseMatrixT& b, } DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); -template +template void BaseMatrixT::max2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Max(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, + ONE_PARAMETER, c += ((a > p) == (b > p)) ? 0.0f : 1.0f); -template -void BaseMatrixT::binaryClassificationError2(size_t destCol, BaseMatrixT& b, - BaseMatrixT& c, T p) { +template +void BaseMatrixT::binaryClassificationError2(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, + T p) { CHECK(!useGpu_) << "do not support gpu"; MatrixOffset offset(0, 0, 0, 0, destCol, 0); int numRows = b.height_; int numCols = b.width_; - b.applyTernary(ternary::BinaryClassificationError(p), c, *this, numRows, - numCols, offset, false_type(), true_type() /*cAsColVector*/); + b.applyTernary(ternary::BinaryClassificationError(p), + c, + *this, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } -template<> +template <> void BaseMatrixT::binaryClassificationError(size_t destCol, BaseMatrixT& b, BaseMatrixT& c, @@ -1092,127 +1209,148 @@ void BaseMatrixT::binaryClassificationError(size_t destCol, MatrixOffset offset(destCol, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), base::binary::classificationError(p), - base::binary::add(), b, c, numRows, numCols, offset, false_type(), + aggregate(aggregate::sum(), + base::binary::classificationError(p), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), true_type() /*aAsColVector*/); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, a = p1 * b + p2 * c + p3 * d); -template -void BaseMatrixT::add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, - T p2, T p3) { +template +void BaseMatrixT::add3( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) { applyQuaternary(quaternary::Add3(p1, p2, p3), b, c, d); } DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMulSquare(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c); -template +template void BaseMatrixT::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotSquareSquare(), b, c); } 
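The hunks above and below reformat a long run of DEFINE_MATRIX_*_OP definitions; as a reading aid, the sketch that follows shows the general pattern those macros implement — each macro expands to a small functor whose operator() evaluates the given expression element-wise, and an apply* member sweeps it over the matrix data. It is a minimal CPU-only sketch under assumed names (SimpleMatrix, DEFINE_BINARY_PARAMETER_OP), not the actual Paddle implementation, which additionally dispatches the same expression to GPU kernels.

// Minimal sketch of the DEFINE_MATRIX_*_OP + applyBinary pattern.
// SimpleMatrix and DEFINE_BINARY_PARAMETER_OP are illustrative names only.
#include <cstddef>
#include <vector>

// Expands to a functor carrying two parameters; the expression is run
// element-wise on (a, b) pairs.
#define DEFINE_BINARY_PARAMETER_OP(name, expr)                \
  struct name {                                               \
    float p1, p2;                                             \
    name(float p1_, float p2_) : p1(p1_), p2(p2_) {}          \
    void operator()(float& a, const float& b) const { expr; } \
  };

// Same update rule as DecayAddSquare in the hunk above: a = p1*a + p2*b*b.
DEFINE_BINARY_PARAMETER_OP(DecayAddSquare, a = p1 * a + p2 * b * b)

struct SimpleMatrix {
  std::vector<float> data;
  explicit SimpleMatrix(size_t n) : data(n, 0.0f) {}

  // CPU-only stand-in for the applyBinary() used throughout BaseMatrix.cu.
  template <class Op>
  void applyBinary(Op op, const SimpleMatrix& b) {
    for (size_t i = 0; i < data.size(); ++i) op(data[i], b.data[i]);
  }

  // Mirrors the one-line member functions in the diff: build the functor,
  // hand it to applyBinary.
  void decayAddSquare(const SimpleMatrix& b, float p1, float p2) {
    applyBinary(DecayAddSquare(p1, p2), b);
  }
};

// The macro keeps each element-wise operator to a single expression, so the
// same definition can back both the CPU loop shown here and a GPU kernel,
// which is why the patch only reflows these definitions rather than
// restructuring them.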
DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b) { applyBinary(binary::DotMulSquare(), b); } DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b); -template +template void BaseMatrixT::dotSquareMul(BaseMatrixT& b) { applyBinary(binary::DotSquareMul(), b); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, + THREE_PARAMETER, T tmp = p1 * b + p2 * c + p3 * d; a += tmp * tmp); -template -void BaseMatrixT::addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, - T p1, T p2, T p3) { +template +void BaseMatrixT::addSquareSum( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) { applyQuaternary(quaternary::AddSquareSum(p1, p2, p3), b, c, d); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b); -template +template void BaseMatrixT::addSquare(BaseMatrixT& b, T p) { applyBinary(binary::AddSquare(p), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, + TWO_PARAMETER, a = p1 * a + p2 * b * b); -template +template void BaseMatrixT::decayAddSquare(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::DecayAddSquare(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, + TWO_PARAMETER, a = p1 * a + p2 * b * b * c * c); -template -void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, +template +void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, + BaseMatrixT& c, + T p1, T p2) { applyTernary(ternary::DecayAddSquareMul(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, + THREE_PARAMETER, a = 1 / (p1 * b + p2 * c + p3)); -template -void BaseMatrixT::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, - T p3) { +template +void BaseMatrixT::reciprocalSum( + BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::ReciprocalSum(p1, p2, p3), b, c); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, + TWO_PARAMETER, a = 1 / (p1 * b + p2)); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Reciprocal2(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, + TWO_PARAMETER, T tmp = p1 * b + p2 * c; a *= tmp * tmp); -template -void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, +template +void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, + BaseMatrixT& c, + T p1, T p2) { applyTernary(ternary::DotMulSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, + TWO_PARAMETER, T tmp = p1 * b + p2 * c; a = tmp * tmp); -template +template void BaseMatrixT::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, + TWO_PARAMETER, a *= p1 * b + p2 * c); -template +template void BaseMatrixT::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotMulSum(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0); -template +template void 
BaseMatrixT::copyAndClear(BaseMatrixT& b) { applyBinary(binary::CopyAndClear(), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, + TWO_PARAMETER, a = p1 * a + p2 * b * c); -template +template void BaseMatrixT::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::AddDotMul(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(Assign, a = b;); -template +template void BaseMatrixT::assign(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Assign(), b); @@ -1223,7 +1361,7 @@ void BaseMatrixT::assign(BaseMatrixT& b) { } } -template +template void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -1243,24 +1381,31 @@ void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { } DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp); -template +template void BaseMatrixT::deepSwap(BaseMatrixT& b) { - applyBinary(binary::DeepSwap(), b); + applyBinary(binary::DeepSwap(), b); } -template<> +template <> void BaseMatrixT::rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { int numRows = b.height_; int numCols = b.width_; MatrixOffset offset(destCol, 0, 0, 0, 0, 0); - aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, - numRows, numCols, offset, false_type(), + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), true_type() /*aAsColVector*/); } -template +template void BaseMatrixT::rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { @@ -1283,17 +1428,24 @@ void BaseMatrixT::rowDotMul2(size_t destCol, } } -template<> +template <> void BaseMatrixT::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, - numRows, numCols, offset, true_type() /*aAsRowVector*/, + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, false_type()); } -template +template void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1314,16 +1466,22 @@ void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { } DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c); -template +template void BaseMatrixT::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - true_type() /*cAsRowVector*/, false_type()); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /*cAsRowVector*/, + false_type()); } -template +template void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1343,16 +1501,22 @@ void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } -template 
+template void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1372,52 +1536,82 @@ void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, - true_type() /* cAsRowVector */, false_type() /* cAsColVector */); + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - true_type() /* cAsRowVector */, false_type() /* cAsColVector */); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c); -template +template void BaseMatrixT::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowAdd(p), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::RowAdd(p), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c)); -template<> +template <> void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowPow(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::RowPow(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } else { size_t height = this->height_; size_t width = this->width_; @@ -1434,44 +1628,64 @@ void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::mulRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b); -template +template void BaseMatrixT::divRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + 
false_type()); } -template +template void BaseMatrixT::mulColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), b, numRows, numCols, offset, - false_type(), true_type() /* bAsColVector */); + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); } -template +template void BaseMatrixT::divColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), b, numRows, numCols, offset, - false_type(), true_type() /* bAsColVector */); + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); } -template<> +template <> template int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1479,13 +1693,20 @@ int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, - numCols, offset, false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1493,16 +1714,25 @@ int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, - false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template -int BaseMatrixT::applyRow( - Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { +int BaseMatrixT::applyRow(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { if (scaleDest != 0) { applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1514,10 +1744,10 @@ int BaseMatrixT::applyRow( return 0; } -template<> +template <> template -int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, - BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow( + Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); size_t numRows = b.height_; size_t numCols = b.width_; @@ -1525,16 +1755,27 @@ int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, CHECK_EQ(width_, 1UL); CHECK_EQ(c.height_, numRows); CHECK_EQ(c.width_, numCols); - aggregate(agg, op, sv, - b, c, numRows, numCols, offset, - false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + op, + sv, + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template -int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, - BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow(Agg agg, + Op op, + real scaleDest, + real scaleAgg, + BaseMatrixT& b, + BaseMatrixT& c) { if (scaleDest != 0) { applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); } else { @@ -1546,7 +1787,7 @@ int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, return 0; } -template<> +template <> template int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& 
b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1554,13 +1795,20 @@ int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, - numCols, offset, true_type() /*aAsRowVector*/, false_type()); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); return 0; } -template<> +template <> template int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1568,16 +1816,25 @@ int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, - true_type() /*aAsRowVector*/, false_type()); + aggregate(agg, + base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); return 0; } -template<> +template <> template -int BaseMatrixT::applyCol( - Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { +int BaseMatrixT::applyCol(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { if (scaleDest != 0) { applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1589,48 +1846,51 @@ int BaseMatrixT::applyCol( return 0; } -template<> +template <> void BaseMatrixT::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) { applyRow(aggregate::sum(), scaleDest, scaleSum, b); } -template<> +template <> void BaseMatrixT::maxRows(BaseMatrixT& b) { applyRow(aggregate::max(), b); } -template<> +template <> void BaseMatrixT::minRows(BaseMatrixT& b) { applyRow(aggregate::min(), b); } -template<> +template <> void BaseMatrixT::maxCols(BaseMatrixT& b) { applyCol(aggregate::max(), b); } -template<> +template <> void BaseMatrixT::minCols(BaseMatrixT& b) { applyCol(aggregate::min(), b); } -template<> +template <> void BaseMatrixT::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { applyCol(aggregate::sum(), scaleDest, scaleSum, b); } -template<> -void BaseMatrixT::sumOfSquaredDiffs( - BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), base::binary::squaredDiff(), - scaleDest, scaleSum, b, c); +template <> +void BaseMatrixT::sumOfSquaredDiffs(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow( + aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c); } -template<> -void BaseMatrixT::sumOfProducts( - BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), base::binary::mul(), - scaleDest, scaleSum, b, c); +template <> +void BaseMatrixT::sumOfProducts(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c); } template class BaseMatrixT; diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h index 120d69f718..12ad2d45a0 100644 --- a/paddle/math/BaseMatrix.h +++ b/paddle/math/BaseMatrix.h @@ -488,6 +488,13 @@ public: */ void clip(T p1, T p2); + /** + * this = b < low ? 0 : 1 + * + * this = b > high ? 0 : 1 + */ + void clipDerivative(BaseMatrixT& b, T p1, T p2); + /** * @code * a = a > p ? 
1.0f : 0.0f diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt index 9981de6160..bf28092e82 100644 --- a/paddle/math/CMakeLists.txt +++ b/paddle/math/CMakeLists.txt @@ -15,13 +15,13 @@ file(GLOB MATH_HEADERS . *.h) file(GLOB MATH_SOURCES . *.cpp) set(MATH_SOURCES - "${PROJ_ROOT}/paddle/math/BaseMatrix.cu" - "${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu" + "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu" + "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu" ${MATH_SOURCES}) if(NOT WITH_GPU) # then compile BaseMatrix.cu as c++ file - compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu") - compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu") + compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu") + compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu") add_library(paddle_math STATIC ${MATH_SOURCES}) else() diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h index 860cad1047..36d57bbb65 100644 --- a/paddle/math/CpuSparseMatrix.h +++ b/paddle/math/CpuSparseMatrix.h @@ -302,6 +302,10 @@ public: bool isSparse() const { return true; } private: + using Matrix::mul; using Matrix::copyFrom; + using Matrix::rowMax; + using Matrix::print; + using Matrix::subMatrix; }; } // namespace paddle diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index 7045562dd4..c8ba1074a1 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -202,7 +202,7 @@ double dotProduct(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); } -#ifdef PADDLE_USE_MKL +#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML) template <> void vExp(const int n, const float* a, float* r) { @@ -243,7 +243,55 @@ template <> void vAdd(const int n, const double* a, const double* b, double* r) { vdAdd(n, a, b, r); } +#else + +DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a)); +template +void vExp(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vExp(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a)); +template +void vLog(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vLog(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p)); +template +void vPow(const int n, const T* a, const T b, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vPow(b), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b); +template +void vAdd(const int n, const T* a, const T* b, T* r) { + hl_cpu_apply_ternary_op, 0, 0>(ternary::vAdd(), + const_cast(a), + const_cast(b), + r, + 1, + n, + n, + n, + n); +} + +template void vExp(const int n, const float* a, float* r); +template void vExp(const int n, const double* a, double* r); +template void vLog(const int n, const float* a, float* r); +template void vLog(const int n, const double* a, double* r); +template void vPow(const int n, const float* a, const float b, float* r); +template void vPow(const int n, const double* a, const double b, double* r); +template void vAdd(const int n, const float* a, const float* b, float* r); +template void vAdd(const int n, const double* a, const double* b, double* r); +#endif + +#ifdef PADDLE_USE_MKL template <> void vInvSqrt(const int n, const float* a, float* r) { vsInvSqrt(n, a, r); @@ -275,20 +323,6 @@ void vTanh(const int n, const double* a, double* r) { } #else -DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a)); -template -void 
vExp(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vExp(), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a)); -template -void vLog(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vLog(), const_cast(a), r, 1, n, n, n); -} - DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a)); template void vInvSqrt(const int n, const T* a, T* r) { @@ -312,41 +346,12 @@ void vTanh(const int n, const T* a, T* r) { binary::vTanh(), const_cast(a), r, 1, n, n, n); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p)); -template -void vPow(const int n, const T* a, const T b, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vPow(b), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b); -template -void vAdd(const int n, const T* a, const T* b, T* r) { - hl_cpu_apply_ternary_op, 0, 0>(ternary::vAdd(), - const_cast(a), - const_cast(b), - r, - 1, - n, - n, - n, - n); -} - -template void vExp(const int n, const float* a, float* r); -template void vExp(const int n, const double* a, double* r); -template void vLog(const int n, const float* a, float* r); -template void vLog(const int n, const double* a, double* r); template void vInvSqrt(const int n, const double* a, double* r); template void vInvSqrt(const int n, const float* a, float* r); template void vLog1p(const int n, const float* a, float* r); template void vLog1p(const int n, const double* a, double* r); template void vTanh(const int n, const float* a, float* r); template void vTanh(const int n, const double* a, double* r); -template void vPow(const int n, const float* a, const float b, float* r); -template void vPow(const int n, const double* a, const double b, double* r); -template void vAdd(const int n, const float* a, const float* b, float* r); -template void vAdd(const int n, const double* a, const double* b, double* r); #endif diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 8ada0d34c6..637643838f 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -15,6 +15,12 @@ limitations under the License. 
*/ #ifndef MATHFUNCTIONS_H_ #define MATHFUNCTIONS_H_ +#ifdef PADDLE_USE_MKLML +#include +#include +#include +#endif + #ifdef PADDLE_USE_MKL #include #include diff --git a/paddle/math/MathUtils.cpp b/paddle/math/MathUtils.cpp index 5bbc3e4e37..980b6e1388 100644 --- a/paddle/math/MathUtils.cpp +++ b/paddle/math/MathUtils.cpp @@ -25,7 +25,7 @@ namespace paddle { */ void sparseRand( int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) { - CHECK(size_t(nnz) > size_t(1)); + CHECK(size_t(nnz) >= size_t(1)); int* cpuMajor; int* cpuMinor; CpuIVector cpuMinorVec(nnz); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 4431d613f6..27f7d95b75 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1016,81 +1016,6 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { LOG(INFO) << "the diffCnt is " << diffCnt; } -void GpuMatrix::convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - CHECK(feature.useGpu_ == true) << "Matrix type are not equal"; - - CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), - feature.getHeight() * feature.getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - - hl_expand_feature2col(feature.getData(), - channels, - feaImgHeight, - feaImgWidth, - blockH, - blockW, - strideH, - strideW, - paddingH, - paddingW, - outputH, - outputW, - getData()); -} - -void GpuMatrix::convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha, - real beta) { - CHECK(expandFeat.useGpu_ == true) << "Matrix type are not equal"; - CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), - getHeight() * getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockW * blockH * channels; - CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) - << "Matrix dimensions are not equal"; - hl_shrink_col2feature(expandFeat.getData(), - channels, - thisImgHeight, - thisImgWidth, - blockH, - blockW, - strideH, - strideW, - paddingH, - paddingW, - outputH, - outputW, - getData(), - alpha, - beta); -} - void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, @@ -1777,103 +1702,6 @@ void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { CHECK_EQ(info, 0); } -void CpuMatrix::convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - CHECK(feature.useGpu_ == false) << "Matrix type are not equal"; - - CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), - feature.getHeight() * feature.getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - - int channelsCol = channels * blockH * blockW; - real* srcData = feature.getData(); - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % blockW; - int hOffset = (c / blockW) % blockH; - int c_im = c / blockH / blockW; - for (int h = 0; h < outputH; ++h) { - for (int w = 0; w < 
outputW; ++w) { - // no c_im*height to Exclude the channel number - int imgRowIdx = h * strideH + hOffset; - int imgColIdx = w * strideW + wOffset; - if ((imgRowIdx - paddingH) < 0 || - (imgRowIdx - paddingH) >= feaImgHeight || - (imgColIdx - paddingW) < 0 || - (imgColIdx - paddingW) >= feaImgWidth) { - data_[(c * outputH + h) * outputW + w] = 0; - } else { - imgRowIdx += c_im * feaImgHeight - paddingH; - imgColIdx -= paddingW; - data_[(c * outputH + h) * outputW + w] = - srcData[imgRowIdx * feaImgWidth + imgColIdx]; - } - } - } - } -} - -void CpuMatrix::convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha, - real beta) { - CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal"; - CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), - getHeight() * getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - - CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) - << "Matrix dimensions are not equal"; - - real* expandData = expandFeat.getData(); - int channelsCol = channels * blockH * blockW; - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % blockW; - int hOffset = (c / blockW) % blockH; - int c_im = c / blockW / blockH; - for (int h = 0; h < outputH; ++h) { - for (int w = 0; w < outputW; ++w) { - int imRowIdx = h * strideH + hOffset; - int imColIdx = w * strideW + wOffset; - if ((imRowIdx - paddingH) >= 0 && - (imRowIdx - paddingH) < thisImgHeight && - (imColIdx - paddingW) >= 0 && - (imColIdx - paddingW) < thisImgWidth) { - imRowIdx += c_im * thisImgHeight - paddingH; - imColIdx -= paddingW; - data_[imRowIdx * thisImgWidth + imColIdx] = - alpha * expandData[(c * outputH + h) * outputW + w] + - beta * data_[imRowIdx * thisImgWidth + imColIdx]; - } - } - } - } -} - void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 7dfd593225..bb802bbb2c 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -859,49 +859,6 @@ public: LOG(FATAL) << "Not implemented"; } - /** - * This function is used to calculate the convolution: - * - * It will expand a feature matrix according to the - * convolution filters - */ - virtual void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * This function is the reverse implementation of convExpand: - * - * Its function is to restore a expanded-matrix into a feature matrix - */ - virtual void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f) { - LOG(FATAL) << "Not implemeted"; - } - /** * Pooling forward operation, pick out the largest element * in the sizeX of value @@ -1335,34 +1292,6 @@ public: void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); - void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void 
convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blochW, - int strideH, - int strideW, - int paddingH, - int paddingWreal, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, @@ -1522,34 +1451,6 @@ public: MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blcokH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h index f6cd5df338..16300db081 100644 --- a/paddle/math/SparseMatrix.h +++ b/paddle/math/SparseMatrix.h @@ -231,6 +231,9 @@ public: private: using Matrix::mul; using Matrix::copyFrom; + using Matrix::rowMax; + using Matrix::print; + using Matrix::subMatrix; }; } // namespace paddle diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 7ce17a3207..4adaaef983 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -32,9 +32,7 @@ static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); }, StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {} StorageEngine::~StorageEngine() { - if (cpuAllocator_) { - delete cpuAllocator_; - } + delete cpuAllocator_; for (auto it : gpuAllocator_) { delete it; } diff --git a/paddle/math/TrainingAlgorithmOp.cu b/paddle/math/TrainingAlgorithmOp.cu index 72ff077270..fc746b8533 100644 --- a/paddle/math/TrainingAlgorithmOp.cu +++ b/paddle/math/TrainingAlgorithmOp.cu @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/utils/Logging.h" #include "BaseMatrix.h" #include "TrainingAlgorithmOp.h" +#include "paddle/utils/Logging.h" #if __cplusplus > 199711L @@ -32,10 +32,10 @@ void sparseMomentumApply(BaseMatrix& value, real tau, real learningRate) { auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad); - auto expr2 = momV.lazyAssign( - momV + (tau * alpha * gamma * learningRate) * grad); - auto expr3 = value.lazyAssign( - (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV); + auto expr2 = + momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad); + auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU + + ((real)1 / beta) * momV); AssignEvaluate(expr1, expr2, expr3); } @@ -52,12 +52,12 @@ void adadeltaApply(BaseMatrix& value, real momentum, real decayRate) { auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square()); - auto expr2 = lr.lazyAssign( - ((accum_update + epsilon) / (accum + epsilon)).sqrt()); - auto expr3 = accum_update.lazyAssign( - rou * accum_update + ((real)1 - rou) * (grad * lr).square()); - auto expr4 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr2 = + lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt()); + auto expr3 = accum_update.lazyAssign(rou * accum_update + + ((real)1 - rou) * (grad * lr).square()); + auto expr4 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr5 = value.lazyAssign(value + mom); AssignEvaluate(expr1, expr2, expr3, expr4, expr5); @@ -74,10 +74,10 @@ void adagradApply(BaseMatrix& value, real momentum, real decayRate) { auto expr1 = accum.lazyAssign(accum + grad.square()); - auto expr2 = lr.lazyAssign( - (accum_buffer + accum + epsilon).sqrt().reciprocal()); - auto expr3 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr2 = + lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal()); + auto expr3 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr4 = value.lazyAssign(value + mom); AssignEvaluate(expr1, expr2, expr3, expr4); @@ -98,8 +98,8 @@ void rmspropApply(BaseMatrix& value, bool firstTime) { auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad); auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal()); - auto expr4 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr4 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr5 = value.lazyAssign(value + mom); if (firstTime) { @@ -107,8 +107,8 @@ void rmspropApply(BaseMatrix& value, AssignEvaluate(expr1, expr2, expr3, expr4, expr5); } else { - auto expr1 = g.lazyAssign( - accumulatedRou * g + ((real)1 - rou) * grad.square()); + auto expr1 = + g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square()); AssignEvaluate(expr1, expr2, expr3, expr4, expr5); } @@ -127,8 +127,8 @@ void decayedAdagradApply(BaseMatrix& value, real decayRate, bool firstTime) { auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal()); - auto expr3 = mom.lazyAssign( - mom * momentum - learningRate * lr * (grad + value * decayRate)); + auto expr3 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); auto expr4 = value.lazyAssign(value + mom); if (firstTime) { @@ -136,8 +136,8 @@ void decayedAdagradApply(BaseMatrix& value, AssignEvaluate(expr1, expr2, expr3, expr4); } else { - auto 
expr1 = accum.lazyAssign( - accumulatedRou * accum + ((real)1 - rou) * grad.square()); + auto expr1 = accum.lazyAssign(accumulatedRou * accum + + ((real)1 - rou) * grad.square()); AssignEvaluate(expr1, expr2, expr3, expr4); } @@ -153,13 +153,12 @@ void adamApply(BaseMatrix& value, real beta2_power, real epsilon, real learningRate) { - real alpha = learningRate * - std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + real alpha = + learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square()); - auto expr3 = value.lazyAssign( - value - (mom * alpha) / (v.sqrt() + epsilon)); + auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon)); AssignEvaluate(expr1, expr2, expr3); } @@ -173,10 +172,10 @@ void adamaxApply(BaseMatrix& value, int64_t step, real alpha) { auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); - auto expr2 = u.lazyAssign( - (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); + auto expr2 = + u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); auto expr3 = value.lazyAssign( - value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); + value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); AssignEvaluate(expr1, expr2, expr3); } @@ -322,8 +321,8 @@ void adamApply(BaseMatrix& value, real beta2_power, real epsilon, real learningRate) { - real alpha = learningRate * - std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + real alpha = + learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; mom = beta1 * mom + ((real)1 - beta1) * grad; @@ -331,7 +330,7 @@ void adamApply(BaseMatrix& value, // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 v = beta2 * v + ((real)1 - beta2) * grad.square(); - value -= (mom * alpha) / (v.sqrt() + epsilon); + value -= (mom * alpha) / (v.sqrt() + epsilon); } void adamaxApply(BaseMatrix& value, diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu index 40e38434fa..31b693afa8 100644 --- a/paddle/math/tests/test_Tensor.cu +++ b/paddle/math/tests/test_Tensor.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include "paddle/math/Matrix.h" #include "TensorCheck.h" +#include "paddle/math/Matrix.h" using paddle::Matrix; using paddle::CpuMatrix; @@ -26,25 +26,25 @@ using paddle::GpuIVector; using autotest::TensorCheckEqual; using autotest::TensorCheckErr; -#define INIT_UNARY(A1, A2) \ - Tensor A1(height, width); \ - Tensor A2(height, width); \ - A1.randomizeUniform(); \ - A2.copyFrom(A1) -#define INIT_BINARY(A1, A2, B) \ - INIT_UNARY(A1, A2); \ - Tensor B(height, width); \ - B.randomizeUniform() -#define INIT_TERNARY(A1, A2, B, C) \ - INIT_BINARY(A1, A2, B); \ - Tensor C(height, width); \ - C.randomizeUniform() -#define INIT_QUATERNARY(A1, A2, B, C, D) \ - INIT_TERNARY(A1, A2, B, C); \ - Tensor D(height, width); \ - D.randomizeUniform() - -template +#define INIT_UNARY(A1, A2) \ + Tensor A1(height, width); \ + Tensor A2(height, width); \ + A1.randomizeUniform(); \ + A2.copyFrom(A1) +#define INIT_BINARY(A1, A2, B) \ + INIT_UNARY(A1, A2); \ + Tensor B(height, width); \ + B.randomizeUniform() +#define INIT_TERNARY(A1, A2, B, C) \ + INIT_BINARY(A1, A2, B); \ + Tensor C(height, width); \ + C.randomizeUniform() +#define INIT_QUATERNARY(A1, A2, B, C, D) \ + INIT_TERNARY(A1, A2, B, C); \ + Tensor D(height, width); \ + D.randomizeUniform() + +template struct TestUnaryMatrix { typedef std::function UnaryFunc; @@ -59,7 +59,7 @@ struct TestUnaryMatrix { } }; -template +template struct TestBinaryMatrix { typedef std::function BinaryFunc; @@ -74,10 +74,10 @@ struct TestBinaryMatrix { } }; -template +template struct TestTernaryMatrix { - typedef std::function TernaryFunc; + typedef std::function + TernaryFunc; explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) { for (auto height : {1, 11, 73, 128, 200, 330}) { @@ -90,10 +90,11 @@ struct TestTernaryMatrix { } }; -template +template struct TestQuaternaryMatrix { typedef std::function QuaternaryFunc; + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)> + QuaternaryFunc; explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) { for (auto height : {1, 11, 73, 128, 200, 330}) { @@ -106,7 +107,7 @@ struct TestQuaternaryMatrix { } }; -template +template struct TestUnaryVectorT { typedef std::function UnaryFunc; @@ -142,11 +143,11 @@ void SetTensorValue(Matrix& matrix, real value) { } } -template +template void testTensorAddScalar(Tensor& A1, Tensor& A2) { real p1 = 2.5; real p2 = 3.0; - A1.add(p1); // a += p + A1.add(p1); // a += p A2 += p1; TensorCheckEqual(A1, A2); @@ -155,7 +156,7 @@ void testTensorAddScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorSubScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.subScalar(p); // a -= p @@ -163,7 +164,7 @@ void testTensorSubScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorMulScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.mulScalar(p); // a *= p @@ -177,7 +178,7 @@ void testTensorMulScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorDivScalar(Tensor& A1, Tensor& A2) { real p = 2.5; A1.divScalar(p); // a /= p @@ -185,44 +186,44 @@ void testTensorDivScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorNeg(Tensor& A1, Tensor& A2) { A1.neg(); // a = -a A2 = -A2; TensorCheckEqual(A1, A2); } -template +template void testTensorAbs(Tensor& A1, Tensor& A2) { A1.abs2(); // a = a > 0 ? 
a : -a A2 = A2.abs(); TensorCheckEqual(A1, A2); } -template +template void testTensorSquare(Tensor& A1, Tensor& A2) { A1.square2(); // a = a * a A2 = A2.square(); TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2) { A1.reciprocal2(); // a = 1.0f / a A2 = A2.reciprocal(); TensorCheckEqual(A1, A2); } -template +template void testTensorSign(Tensor& A1, Tensor& A2) { A1.sign2(); // a = (a > 0) - (a < 0) A2 = A2.sign(); TensorCheckEqual(A1, A2); } -template +template void testTensorAssign(Tensor& A1, Tensor& A2) { - A1.assign(1.5); // a = p + A1.assign(1.5); // a = p A2 = A2.constant(1.5); TensorCheckEqual(A1, A2); @@ -235,7 +236,7 @@ void testTensorAssign(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testUnaryBaseOp(Tensor& A1, Tensor& A2) { testTensorAddScalar(A1, A2); testTensorSubScalar(A1, A2); @@ -249,9 +250,9 @@ void testUnaryBaseOp(Tensor& A1, Tensor& A2) { testTensorAssign(A1, A2); } -template +template void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { - A1.add(2); // a += p + A1.add(2); // a += p A2 += 2; TensorCheckEqual(A1, A2); @@ -266,46 +267,46 @@ void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { TEST(Unary, BaseOp) { TestUnaryMatrix testCpuMatrix(testUnaryBaseOp); TestUnaryVectorT testCpuVector(testUnaryBaseOp); - TestUnaryVectorT - testCpuIVector(testUnaryBaseOpInt); + TestUnaryVectorT testCpuIVector( + testUnaryBaseOpInt); #ifndef PADDLE_ONLY_CPU TestUnaryMatrix testGpuMatrix(testUnaryBaseOp); TestUnaryVectorT testGpuVector(testUnaryBaseOp); - TestUnaryVectorT - testGpuIVector(testUnaryBaseOpInt); + TestUnaryVectorT testGpuIVector( + testUnaryBaseOpInt); #endif } -template +template void testTensorExp(Tensor& A1, Tensor& A2) { A1.exp2(); // a = exp(a) A2 = A2.exp(); TensorCheckErr(A1, A2); } -template +template void testTensorLog(Tensor& A1, Tensor& A2) { A1.log2(); // a = log(a) A2 = A2.log(); TensorCheckErr(A1, A2); } -template +template void testTensorSqrt(Tensor& A1, Tensor& A2) { A1.sqrt2(); // a = sqrt(a) A2 = A2.sqrt(); TensorCheckErr(A1, A2); } -template +template void testTensorPow(Tensor& A1, Tensor& A2) { A1.pow2(3.2); // a = pow(a, p) A2 = A2.pow(3.2); TensorCheckErr(A1, A2); } -template +template void testUnayrMathOp(Tensor& A1, Tensor& A2) { testTensorExp(A1, A2); testTensorLog(A1, A2); @@ -321,7 +322,7 @@ TEST(Unary, MathOp) { #endif } -template +template void testTensorClip(Tensor& A1, Tensor& A2) { real p1 = 0.003f; real p2 = 0.877f; @@ -331,7 +332,7 @@ void testTensorClip(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { real p = 0.5f; A1.biggerThanScalar(p); // a = a > p ? 
1.0f : 0.0f @@ -339,7 +340,7 @@ void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { TensorCheckEqual(A1, A2); } -template +template void testTensorapplyL1(Tensor& A1, Tensor& A2) { /** * T lambda = p; @@ -351,14 +352,15 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2) { real learningRate = 0.7f; real decayRate = 0.6f; A1.applyL1(learningRate, decayRate); - A2 = (A2 > (learningRate * decayRate)).condition( - (A2 - (learningRate * decayRate)), - (A2 < -(learningRate * decayRate)).condition( - (A2 + (learningRate * decayRate)), (real)0.0)); + A2 = (A2 > (learningRate * decayRate)) + .condition( + (A2 - (learningRate * decayRate)), + (A2 < -(learningRate * decayRate)) + .condition((A2 + (learningRate * decayRate)), (real)0.0)); TensorCheckEqual(A1, A2); } -template +template void testUnayrCompareOp(Tensor& A1, Tensor& A2) { testTensorClip(A1, A2); testTensorBiggerThanScalar(A1, A2); @@ -377,7 +379,7 @@ TEST(Unary, CompareOp) { #endif } -template +template void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.2; @@ -406,7 +408,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.sub(B); // a -= b @@ -422,7 +424,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.mulScalar(B, p); // a = b * p @@ -442,7 +444,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { real p = 2.5; A1.divScalar(B, p); // a = b / p @@ -454,28 +456,28 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) { A1.assign(B); // a = b A2 = B; TensorCheckEqual(A1, A2); } -template +template void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) { - B.square2(A1); // b = a * a + B.square2(A1); // b = a * a A2 = B.square(); TensorCheckEqual(A1, A2); } -template +template void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.squareDerivative(B); // a *= 2.0 * b A2 = A2 * (real)2.0 * B; TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { B.reciprocal2(A1); // b = 1.0f / a A2 = B.reciprocal(); @@ -490,33 +492,33 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { real learningRate = 0.7f; real decayRate = 1.2f; A1.applyL2(B, learningRate, decayRate); // a *= (1.0f / (1.0f + p * b)) - A2 *= (B.constant(1.0f) + - B.constant(learningRate * decayRate) * B).reciprocal(); + A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B) + .reciprocal(); TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.reciprocalDerivative(B); // a *= -b * b A2 *= (-B) * B; TensorCheckEqual(A1, A2); } -template +template void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) { B.sign2(A1); // b = a > 0.0f ? 1.0f : -1.0f A2 = B.sign(); TensorCheckEqual(A1, A2); } -template +template void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) { B.abs2(A1); // b = a > 0.0f ? 
a : -a A2 = B.abs(); TensorCheckEqual(A1, A2); } -template +template void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) { testTensorAdd(A1, A2, B); testTensorSub(A1, A2, B); @@ -539,7 +541,7 @@ TEST(Binary, BaseOp) { #endif } -template +template void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { // a = exp(b) A1.exp2(B); @@ -547,14 +549,14 @@ void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.expDerivative(B); // a *= b A2 *= B; TensorCheckEqual(A1, A2); } -template +template void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { // a = log(b) A1.log2(B); @@ -562,7 +564,7 @@ void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { // a = sqrt(b) A1.sqrt2(B); @@ -570,7 +572,7 @@ void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { // a = 1.0f / sqrt(b) A1.invSqrt(B); @@ -578,14 +580,14 @@ void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckErr(A1, A2); } -template +template void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) { A1.pow2(B, 2.5f); // a = pow(b, p) A2 = B.pow(2.5f); TensorCheckErr(A1, A2); } -template +template void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { /* * const T THRESHOLD = 40.0; @@ -597,12 +599,14 @@ void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { real THRESHOLD = 40.0; A2 = (B.constant(1.0f) + - (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)).exp()).log(); + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)) + .exp()) + .log(); TensorCheckErr(A1, A2); } -template +template void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { /* * const T THRESHOLD = 40.0; @@ -612,14 +616,16 @@ void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { */ A1.softreluDerivative(B); real THRESHOLD = 40.0; - A2 = A2 * (B.constant(1.0f) - - (B.constant(-1.0f) * - (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))).exp()); + A2 = A2 * + (B.constant(1.0f) - + (B.constant(-1.0f) * + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))) + .exp()); TensorCheckErr(A1, A2); } -template +template void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { /* const T THRESHOLD_MIN = -40.0; @@ -632,46 +638,47 @@ void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { const real THRESHOLD_MIN = -40.0; const real THRESHOLD_MAX = 13.0; - auto tmp = (B < THRESHOLD_MIN).condition( - THRESHOLD_MIN, (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); + auto tmp = (B < THRESHOLD_MIN) + .condition(THRESHOLD_MIN, + (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal(); TensorCheckErr(A1, A2); } -template +template void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.sigmoidDerivative(B); // a *= b * (1 - b) A2 *= B * (B.constant(1.0f) - B); TensorCheckEqual(A1, A2); } -template +template void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) { B.tanh(A1); // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f; TensorCheckErr(A1, A2); } -template +template void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { 
A1.tanhDerivative(B); // a *= 1 - b * b A2 *= B.constant(1.0f) - B * B; TensorCheckEqual(A1, A2); } -template +template void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.1; // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0) B.scaledTanh(A1, p1, p2); A2 = B.constant(p1) * - (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - - (real)1.0); + (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - + (real)1.0); TensorCheckErr(A1, A2); } -template +template void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { real p1 = 2.5; real p2 = 3.1; @@ -681,7 +688,7 @@ void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) { testTensorTanhDerivative(A1, A2, B); testTensorScaledTanhDerivative(A1, A2, B); @@ -708,21 +715,21 @@ TEST(Binary, MathOp) { #endif } -template +template void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) { B.relu(A1); // b = a > 0.0f ? a : 0.0f A2 = (B > (real)0.0f).condition(B, (real)0.0f); TensorCheckEqual(A1, A2); } -template +template void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.reluDerivative(B); // a *= (b > 0.0f ? 1.0f : 0.0f) A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0); TensorCheckEqual(A1, A2); } -template +template void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { /* * b = a > p1 ? a : p1 @@ -736,7 +743,7 @@ void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { SetTensorValue(B, 32.0f); /* @@ -748,15 +755,15 @@ void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) { A1.absDerivative(B); // a = (b > 0) ? a : (b < 0) ? 
-a : 0 - A2 = (B > (real)0.0f).condition(A2, - (B < (real)0.0f).condition(-A2, (real)0.0f)); + A2 = (B > (real)0.0f) + .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f)); TensorCheckEqual(A1, A2); } -template +template void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { real p = 0.613; SetTensorValue(B, p); @@ -765,7 +772,7 @@ void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { TensorCheckEqual(A1, A2); } -template +template void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { /** * T lambda = p * b; @@ -778,12 +785,13 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { real decayRate = 0.6f; A1.applyL1(B, learningRate, decayRate); auto lambda = B.constant(learningRate * decayRate) * B; - A2 = (A2 > lambda).condition( - (A2 - lambda), (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); + A2 = (A2 > lambda) + .condition((A2 - lambda), + (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); TensorCheckEqual(A1, A2); } -template +template void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) { B.subScalar(0.5f); SetTensorValue(B, 0.0f); @@ -807,7 +815,7 @@ TEST(Binary, CompareOp) { #endif } -template +template void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.add(B, C); // a = b + c A2 = B + C; @@ -833,7 +841,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.sub(B, C); // a = b - c A2 = B - C; @@ -846,7 +854,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.dotMul(B, C); // a = b * c A2 = B * C; @@ -892,7 +900,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.dotDiv(B, C); // a = (b == 0.0) ? 0.0 : b / c A2 = (B == (real)0.0).condition((real)0.0, B / C); @@ -905,7 +913,7 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { real p1 = 1.5; real p2 = 2.5; @@ -915,14 +923,14 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { TensorCheckEqual(A1, A2); } -template +template void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.softCrossEntropy(B, C); // a = -c * log(b) - (1 - c) * log(1 - b) A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log(); TensorCheckErr(A1, A2); } -template +template void testTensorSoftCrossEntropyBp(Tensor& A1, Tensor& A2, Tensor& B, @@ -932,7 +940,7 @@ void testTensorSoftCrossEntropyBp(Tensor& A1, TensorCheckEqual(A1, A2); } -template +template void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { testTensorAdd(A1, A2, B, C); testTensorSub(A1, A2, B, C); @@ -952,30 +960,30 @@ TEST(Ternary, BaseOp) { #endif } -template +template void testTensorBinaryLabelCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.binaryLabelCrossEntropy(B, C); // a = c > 0.5 ? 
-log(b) : -log(1.0 - b) - A2 = (C > (real)0.5).condition( - -(B.log()), -((B.constant(1.0f) - B).log())); + A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log())); TensorCheckErr(A1, A2); } -template +template void testTensorBinaryLabelCrossEntropyBp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b) A1.binaryLabelCrossEntropyBp(B, C); - A2 += (C > (real)0.5).condition( - (B.constant(-1.0f) / B), (B.constant(1.0f) - B).reciprocal()); + A2 += (C > (real)0.5) + .condition((B.constant(-1.0f) / B), + (B.constant(1.0f) - B).reciprocal()); TensorCheckErr(A1, A2); } -template +template void testTensorLogisticRegressionLoss(Tensor& A1, Tensor& A2, Tensor& B, @@ -991,13 +999,14 @@ void testTensorLogisticRegressionLoss(Tensor& A1, */ A1.logisticRegressionLoss(B, C); real THRESHOLD = 40.0; - auto tmp = (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + auto tmp = + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp; TensorCheckErr(A1, A2); } -template +template void testTensorLogisticRegressionLossBp(Tensor& A1, Tensor& A2, Tensor& B, @@ -1013,28 +1022,29 @@ void testTensorLogisticRegressionLossBp(Tensor& A1, */ A1.logisticRegressionLossBp(B, C); real THRESHOLD = 40.0; - auto tmp = (B > THRESHOLD).condition( - THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + auto tmp = + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); auto tmp2 = tmp.exp(); A2 = tmp2 / (C.constant(1.0) + tmp2) - C; TensorCheckErr(A1, A2); } -template +template void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.biggerThan(B, C); // a = (b > c) ? 1.0f : 0.0f A2 = (B > C).condition((real)1.0f, (real)0.0f); TensorCheckEqual(A1, A2); } -template +template void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { A1.max2(B, C); // a = (b > c) ? b : c A2 = (B > C).condition(B, C); TensorCheckEqual(A1, A2); } -template +template void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C); testTensorBinaryLabelCrossEntropy(A1, A2, B, C); @@ -1053,12 +1063,9 @@ TEST(Ternary, CompareOp) { #endif } -template -void testQuaternaryAdd(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testQuaternaryAdd( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f); // a = p1 * b + p2 * c + p3 * d // A2 = B * 1.5f + C * 2.5f + D * 3.5f; // TensorCheckEqual(A1, A2); @@ -1084,25 +1091,19 @@ TEST(Quaternary, BaseOp) { #endif } -template -void testTensorBiggerThan(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorBiggerThan( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 
1.0f : 0.0f); A1.biggerThan(B, C, D); - A2 = ((B > C && D > (real)0.5) - || (B < C && D < (real)0.5)).condition((real)1.0, (real)0.0); + A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5)) + .condition((real)1.0, (real)0.0); TensorCheckEqual(A1, A2); } -template -void testTensorRankLoss(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorRankLoss( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { /** * const T THRESHOLD = 40.0; a = b - c; * a = (a > THRESHOLD) @@ -1114,19 +1115,17 @@ void testTensorRankLoss(Tensor& A1, real THRESHOLD = 40.0; auto tmp = B - C; - auto tmp2 = (tmp > THRESHOLD).condition( - THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + auto tmp2 = + (tmp > THRESHOLD) + .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D; TensorCheckErr(A1, A2); } -template -void testTensorRankLossBp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testTensorRankLossBp( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { /** * const T THRESHOLD = 40.0; a = b - c; * a = (a > THRESHOLD) @@ -1137,20 +1136,18 @@ void testTensorRankLossBp(Tensor& A1, A1.rankLossBp(B, C, D); real THRESHOLD = 40.0; auto tmp = B - C; - auto tmp2 = (tmp > THRESHOLD).condition( - THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + auto tmp2 = + (tmp > THRESHOLD) + .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); auto tmp3 = tmp2.exp(); A2 = tmp3 / (D.constant(1.0f) + tmp3) - D; TensorCheckErr(A1, A2); } -template -void testQuaternaryCompareOp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C, - Tensor& D) { +template +void testQuaternaryCompareOp( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { testTensorBiggerThan(A1, A2, B, C, D); testTensorRankLoss(A1, A2, B, C, D); testTensorRankLossBp(A1, A2, B, C, D); diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu index 786d863a53..92afab4ff7 100644 --- a/paddle/math/tests/test_lazyAssign.cu +++ b/paddle/math/tests/test_lazyAssign.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "PerfUtils.h" +#include "TensorCheck.h" #include "paddle/math/Matrix.h" #include "paddle/math/TensorAssign.h" -#include "TensorCheck.h" -#include "PerfUtils.h" using paddle::BaseMatrix; using paddle::CpuMatrix; @@ -27,14 +27,28 @@ using autotest::TensorCheckErr; typedef std::function testMatrixFunc; void testMatrixCase(testMatrixFunc matrixFunc) { for (auto height : {1}) { - for (auto width : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072, - 262144, 524288, 1048576, 2097152, 4194304, 8388608}) { + for (auto width : {1, + 32, + 64, + 128, + 512, + 1024, + 4096, + 32768, + 65536, + 131072, + 262144, + 524288, + 1048576, + 2097152, + 4194304, + 8388608}) { matrixFunc(height, width); } } } -template +template void testLazyAssign(int height, int width) { Tensor A1(height, width); Tensor A2(height, width); @@ -49,40 +63,39 @@ void testLazyAssign(int height, int width) { EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;); - EXPRESSION_PERFORMANCE( - auto expr1 = A2.lazyAssign(B + C); - auto expr2 = A2.lazyAssign(A2 * D); - AssignEvaluate(expr1, expr2);); + EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C); + auto expr2 = A2.lazyAssign(A2 * D); + AssignEvaluate(expr1, expr2);); TensorCheckErr(A1, A2); } -TEST(lazyAssign, CPU) { - testMatrixCase(testLazyAssign); -} +TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign); } #ifndef PADDLE_ONLY_CPU -TEST(lazyAssign, GPU) { - testMatrixCase(testLazyAssign); -} +TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign); } #endif -template -void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D, - real p1, real p2, real p3) { +template +void sgdUpdateTensor( + Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) { C = C * p2 - D * (B + A * p3) * p1; A += C; } -void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B, - BaseMatrix& C, BaseMatrix& D, - real p1, real p2, real p3) { +void sgdUpdateLazyAssign(BaseMatrix& A, + BaseMatrix& B, + BaseMatrix& C, + BaseMatrix& D, + real p1, + real p2, + real p3) { auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1); auto expr2 = A.lazyAssign(A + C); AssignEvaluate(expr1, expr2); } -template +template void testSgdUpdate(int height, int width) { Tensor A1(height, width); Tensor A2(height, width); @@ -113,16 +126,13 @@ void testSgdUpdate(int height, int width) { * a = a + c; */ // BaseMatrix API - EXPRESSION_PERFORMANCE( - A1.sgdUpdate(B, C1, D, p1, p2, p3);); + EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3);); // Tensor expression - EXPRESSION_PERFORMANCE( - sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); + EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); // lazyAssign - EXPRESSION_PERFORMANCE( - sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); + EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); TensorCheckErr(A1, A2); TensorCheckErr(A1, A3); @@ -130,12 +140,8 @@ void testSgdUpdate(int height, int width) { TensorCheckErr(C1, C3); } -TEST(sgdUpdate, CPU) { - testMatrixCase(testSgdUpdate); -} +TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate); } #ifndef PADDLE_ONLY_CPU -TEST(sgdUpdate, GPU) { - testMatrixCase(testSgdUpdate); -} +TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate); } #endif diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 354f58df39..d77478f345 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -79,8 +79,8 @@ void testMatrixMaxSequence(int batchSize, int inputDim) { } TEST(Matrix, 
maxSequence) { - for (auto batchSize : {1, 10, 128, 1000, 6000}) { - for (auto inputDim : {1, 32, 100, 512}) { + for (auto batchSize : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 + for (auto inputDim : {1, 7, 131}) { // prime numbers close to 1, 8, 128 VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; testMatrixMaxSequence(batchSize, inputDim); } } @@ -240,14 +240,10 @@ TEST(Matrix, unary) { // inverse matrix testMatrixInverse(height); #else - LOG(WARNING) << "Cannot run Matrix Inverse Unit Test.\n" - << "Failed to find lapack library in current system.\n" - << "To address this issue, Please adopt one of the following " "approaches: \n" - << "1. Simply issue `sudo apt-get install liblapacke-dev` to " "avoid re-build source code. \n" - << "2. Install MKL/Openblas/ATLAS and re-build PaddlePaddle " "source code."; + LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK " + << "support, so we cannot test matrix inverse. To test " + << "matrix inverse, please install LAPACKE " + << "and MKL/Openblas/ATLAS, and re-build PaddlePaddle."; #endif } } @@ -341,8 +337,8 @@ void testMatrixSoftmaxBp(int height, int width) { } TEST(Matrix, softmax) { - for (auto height : {1, 11, 73, 128, 200}) { - for (auto width : {1, 32, 100, 512, 1000}) { + for (auto height : {1, 3, 131}) { // prime numbers close to 1, 4, 127 + for (auto width : {1, 17, 251}) { // prime numbers close to 1, 16, 256 VLOG(3) << " height=" << height << " width=" << width; testMatrixSoftmax(height, width); @@ -527,7 +523,7 @@ void testVectorRowFunc(int size) { } TEST(Vector, rowFunc) { - for (auto size : {1, 5, 31, 90, 150, 500, 1000, 4000}) { + for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 VLOG(3) << " size=" << size; testVectorRowFunc(size); } @@ -604,7 +600,7 @@ void testVectorIsEqual(int size) { } TEST(Vector, Equal) { - for (auto size : {1, 5, 31, 90, 150, 500, 1000, 4000}) { + for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 VLOG(3) << " size=" << size; testVectorReset(size); testVectorReset(size); @@ -635,9 +631,8 @@ void testMatrixTopK(int samples, int dim, int beamSize) { } TEST(Matrix, topK) { - for (auto samples : {1, 5, 31, 90, 150, 500}) { - for (auto dim : - {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) { + for (auto samples : {1, 17, 131}) { // prime numbers close to 1, 16, 127 + for (auto dim : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { if (beamSize > dim) continue; VLOG(3) << " samples=" << samples << " beamSize=" << beamSize @@ -650,6 +645,7 @@ TEST(Matrix, topK) { void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { int nnz = samples * dim * ratio; + if (nnz < 1) nnz = 1; // Because sparseRand in MathUtil.cpp requires this. 
MatrixPtr cpuSrc = std::make_shared(samples, dim, nnz); MatrixPtr gpuSrc = std::make_shared(samples, dim, nnz); MatrixPtr cpuVal = std::make_shared(samples, beamSize); @@ -683,9 +679,9 @@ void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { } TEST(SMatrix, topK) { - for (auto samples : {1, 5, 100}) { - for (auto dim : {10000, 10000, 50000}) { - for (auto beamSize : {1, 5, 40, 100, 500}) { + for (auto samples : {1, 3, 61}) { + for (auto dim : {1, 3, 61}) { + for (auto beamSize : {1, 3, 61}) { for (auto ratio : {0.01, 0.001}) { if (beamSize > dim) continue; VLOG(3) << " samples=" << samples << " beamSize=" << beamSize @@ -806,10 +802,9 @@ void testClassificationError(int numSamples, int dim, int topkSize) { } TEST(Matrix, classificationError) { - for (auto numSamples : {1, 5, 31, 90, 150, 300}) { - for (auto dim : - {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) { - for (auto topkSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { + for (auto numSamples : {1, 3, 31}) { + for (auto dim : {1, 3, 31}) { + for (auto topkSize : {1, 3, (int)rand() % dim + 1}) { if (topkSize > dim) continue; VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize << " dim= " << dim; @@ -1016,13 +1011,15 @@ void testAvgPoolFwdBwd(int numSamples, TensorCheckErr(*inputGrad, *inputGpuGrad); } +// TODO(yi): I noticed many such blindly combinatorial tests in this +// file. They are no help to locate defects at all. TEST(Matrix, PoolFwdBwd) { - for (auto numSamples : {5, 32}) { - for (auto channels : {1, 9, 32}) { - for (auto imgSizeH : {14, 28}) { - for (auto imgSizeW : {16, 30}) { - for (auto sizeX : {2, 5}) { - for (auto sizeY : {2, 5}) { + for (auto numSamples : {1, 3}) { + for (auto channels : {1, 3}) { + for (auto imgSizeH : {13, 17}) { + for (auto imgSizeW : {17, 19}) { + for (auto sizeX : {2, 3}) { + for (auto sizeY : {2, 3}) { for (auto sH : {1, 2}) { for (auto sW : {1, 2}) { for (auto pH : {0, (sizeY - 1) / 2}) { @@ -1128,8 +1125,8 @@ TEST(Matrix, MaxOutFwdBwd) { } TEST(CpuMatrix, copyFrom) { - const size_t height = 1000; - const size_t width = 1000; + const size_t height = 31; + const size_t width = 53; CpuMatrix cpu(height, width); GpuMatrix gpu(height, width); CpuMatrix copy(height, width); @@ -1141,4 +1138,69 @@ TEST(CpuMatrix, copyFrom) { TensorCheckEqual(cpu, copy); } +void testBatch2seqPadding(int batchSize, int inputDim) { + MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); + MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); + cpuInput->randomizeUniform(); + gpuInput->copyFrom(*cpuInput); + + IVectorPtr cpuSequence; + generateSequenceStartPositions(batchSize, cpuSequence); + for (int i = 0; i < int(cpuSequence->getSize()); ++i) { + (cpuSequence->getData())[i] += 1; // so no way that maxSeqLen is 0; + } + + IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); + gpuSequence->copyFrom(*cpuSequence); + + size_t numSeq = cpuSequence->getSize() - 1; + size_t maxSeqLen = *std::max_element(cpuSequence->getData(), + cpuSequence->getData() + numSeq); + + printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen); + MatrixPtr cBatch = std::make_shared(numSeq * maxSeqLen, inputDim); + MatrixPtr gBatch = std::make_shared(numSeq * maxSeqLen, inputDim); + MatrixPtr cCheck = std::make_shared(numSeq * maxSeqLen, inputDim); + + // hl_sequence2batch_copy_padding(gBatch->getData(), + // gpuInput->getData(), + // cpuSequence->getData(), + // inputDim, + // maxSeqLen, + // numSeq, + // false, + // true); + // cCheck->copyFrom(*gBatch); + + // 
int* seqStart = cpuSequence->getData(); + // float* batchData = cBatch->getData(); + // float* seqData = cpuInput->getData(); + // for (size_t i = 0; i < maxSeqLen; i++) { + // for (size_t j = 0; j < numSeq; j++) { + // size_t sequenceStart = seqStart[j]; + // size_t sequenceLength = seqStart[j + 1] - seqStart[j]; + // if (i < sequenceLength) { + // memcpy(batchData + (i * numSeq + j) * inputDim, + // seqData + (sequenceStart + i) * inputDim, + // inputDim * sizeof(real)); + // } else { + // memset(batchData + (i * numSeq + j) * inputDim, + // 0, + // inputDim * sizeof(real)); + // } + // } + // } + + // TensorCheckErr(*cBatch, *cCheck); +} + +TEST(Matrix, warpCTC) { + for (auto batchSize : {1, 3, 17}) { + for (auto inputDim : {1, 3, 31}) { + VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; + testBatch2seqPadding(batchSize, inputDim); + } + } +} + #endif diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 3943c3cfad..9cc4233e43 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1 +1,16 @@ add_subdirectory(detail) + +cc_library(memory SRCS memory.cc) +cc_library(memcpy SRCS memcpy.cc) + +cc_library(paddle_memory + DEPS + memory + memcpy + meta_data + meta_cache + memory_block + buddy_allocator + system_allocator) + +cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) diff --git a/paddle/memory/README.md b/paddle/memory/README.md index 96a331a486..7f95e80f98 100644 --- a/paddle/memory/README.md +++ b/paddle/memory/README.md @@ -1,140 +1,4 @@ -## Design +# Region-based Heterogeneous Memory Management -### Usage - -To allocate 4KB CPU memory: - -```cpp -p = memory::Alloc(platform::CPUPlace(), 4*1024); -``` - -To allocate 4KB memory on the 3rd GPU: - -```cpp -p = memory::Alloc(platform::GPUPlace(2), 4*1024); -``` - -To free memory and check the so-far used amount of memory on a place: - -```cpp -auto pl = platform::GPUPlace(0); -p = memory::Alloc(pl, 4*1024); -cout << memory::Used(pl); -memory::Free(pl, p); -``` - -### API - -In `paddle/memory/memory.h` we have: - -```cpp -namespace memory { -template void* Alloc(Place, size_t); -template void Free(Place, void*); -template size_t Used(Place); -} // namespace memory -``` - -These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`: - -```cpp -template<> -void* Alloc(CPUPlace p, size_t size) { - return GetCPUBuddyAllocator()->Alloc(size); -} -``` - -and - -```cpp -template<> -void Alloc(GPUPlace p, size_t size) { - return GetGPUBuddyAllocator(p.id)->Alloc(size); -} -``` - -Similar specializations exist for `Free` and `Used`. - -### Implementation - -`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletions. - -```cpp -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = NULL; - if (a == NULL) { - a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...); - } - return a; -} - -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator* as = NULL; - if (as == NULL) { - as = new BuddyAllocator*[platform::NumGPUs()]; - for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) { - as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...); - } - } - return as[gpu_id); -``` - -#### `BuddyAllocator` - -`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes parameters only related with the algorithm: - -```cpp -BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) { - ... 
-} -``` - -Please be aware that **`BuddyAllocator` always allocate aligned memory**, aligned on 32-bytes, which can hold a `BuddyAllocator::Block` object: - -```cpp -class BuddyAllocator { - private: - struct Block { - size_t size; - Block* left, right; - size_t index; // allocator id - }; - ... -}; -``` - -Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` freed in `Free`. Instead, `CPUAllocator` and `GPUAllocator` doesn't know the size of freed memory block and cannot do the trace. - -#### System Allocators - -The `GPUAllocator` and `CPUAllocator` are calls *system allocators*. They work as the fallback allocators of `BuddyAllocator`. - -## Justification - -I got inspiration from Majel and Caffe2, though above design look different from both. - -### Caffe2 - -In Caffe2, `Tensor::mutable_data()` allocates the memroy. In particular, [`Tensor::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479). - -There are two implementations of `Context`: - -1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory. - -1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. - -### Majel - -In Majel, there are basically two allocator types: - -1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`. -1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`. - -However, memory allocation is not via these two allocators. Instead, these two allocators are defined in hidden namespaces. - -In Majel there are hidden global variables like: - -1. `cpu::SystemAllocator g_cpu_allocator`, and -1. `vector g_gpu_allocators(NUM_GPUS)`. - -Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`. +Please check out the [design documentation](http://gangliao.me) to find out more details about +buddy memory allocator for both CPU and GPU. 
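Since the rewritten README now defers to the external design page, the quick-start examples it used to carry are gone. The sketch below consolidates them for reference; it assumes the `memory::Alloc`, `memory::Free`, and `memory::Used` templates declared in `paddle/memory/memory.h` and the `platform::CPUPlace`/`platform::GPUPlace` types described in the removed text, and it leaves the exact include for the place types (and the enclosing `paddle` namespace) as assumptions.

```cpp
// Minimal usage sketch, not the authoritative API; see paddle/memory/memory.h.
#include <iostream>

#include "paddle/memory/memory.h"  // memory::Alloc / memory::Free / memory::Used

using namespace paddle;  // assuming memory:: and platform:: live under paddle::

int main() {
  // 4KB of host memory.
  auto cpu = platform::CPUPlace();
  void* p = memory::Alloc(cpu, 4 * 1024);
  std::cout << memory::Used(cpu) << "\n";  // bytes currently handed out on this place
  memory::Free(cpu, p);

  // 4KB on the 3rd GPU.
  auto gpu = platform::GPUPlace(2);
  void* q = memory::Alloc(gpu, 4 * 1024);
  std::cout << memory::Used(gpu) << "\n";
  memory::Free(gpu, q);
  return 0;
}
```

Requests larger than the maximum chunk size bypass the pool through `SystemAlloc`, and an exhausted pool is grown via `RefillPool`, as the buddy allocator implementation in the hunks below shows.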
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 72d3749ad7..b9c3fc31c1 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,7 +1,15 @@ if(${WITH_GPU}) - nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) - nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) else(${WITH_GPU}) - cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) - cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info) endif(${WITH_GPU}) + +cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) + +cc_library(meta_data SRCS meta_data.cc) + +cc_library(meta_cache SRCS meta_cache.cc) + +cc_library(memory_block SRCS memory_block.cc) + +cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index ebe680f5ee..bb44970109 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -12,22 +12,315 @@ See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "paddle/memory/detail/buddy_allocator.h" +#include "glog/logging.h" namespace paddle { namespace memory { namespace detail { -BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator) - : pool_size_(pool_size), - max_pools_(max_pools), - system_allocator_(system_allocator) { - PADDLE_ASSERT(pool_size > 0); - PADDLE_ASSERT(max_pools > 0); - PADDLE_ASSERT(system_allocator != nullptr); +BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, + size_t min_chunk_size, size_t max_chunk_size) + : min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + cache_(system_allocator->UseGpu()), + system_allocator_(std::move(system_allocator)) {} + +BuddyAllocator::~BuddyAllocator() { + VLOG(3) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; + while (!pool_.empty()) { + auto block = static_cast(std::get<2>(*pool_.begin())); + VLOG(3) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + pool_.erase(pool_.begin()); + } +} + +inline size_t align(size_t size, size_t alignment) { + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); +} + +void* BuddyAllocator::Alloc(size_t unaligned_size) { + // adjust allocation alignment + size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); + + // acquire the allocator lock + std::lock_guard lock(mutex_); + + VLOG(3) << "Allocate " << unaligned_size << " bytes from chunk size " << size; + + // if the allocation is huge, send directly to the system allocator + if (size > max_chunk_size_) { + VLOG(3) << "Allocate from system allocator."; + return SystemAlloc(size); + } + + // query and allocate from the existing chunk + auto it = FindExistChunk(size); + + // refill the pool if failure + if (it == pool_.end()) { + it = RefillPool(); + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } + } else { + VLOG(3) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); + } + + total_used_ += size; + total_free_ -= size; + + // split the allocation and return data for use + return reinterpret_cast(SplitToAlloc(it, size))->data(); +} + +void BuddyAllocator::Free(void* p) { + // Point back to metadata + auto block = static_cast(p)->metadata(); + + // Acquire the allocator lock + std::lock_guard lock(mutex_); + + VLOG(3) << "Free from address " << block; + + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + VLOG(3) << "Free directly from system allocator"; + system_allocator_->Free(block, block->total_size(cache_), + block->index(cache_)); + + // Invalidate GPU allocation from cache + cache_.invalidate(block); + + return; + } + + block->mark_as_free(cache_); + + total_used_ -= block->total_size(cache_); + total_free_ += block->total_size(cache_); + + // Trying to merge the right buddy + if (block->has_right_buddy(cache_)) { + VLOG(3) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); + + auto right_buddy = block->right_buddy(cache_); + + if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(right_buddy->index(cache_), + right_buddy->total_size(cache_), + right_buddy)); + + // merge its right buddy to the block + block->merge(cache_, right_buddy); + } + } + + // Trying to merge the left buddy + if (block->has_left_buddy(cache_)) { + VLOG(3) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); + + auto left_buddy = block->left_buddy(cache_); + + if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(left_buddy->index(cache_), + left_buddy->total_size(cache_), left_buddy)); + + // merge the block to its left buddy + left_buddy->merge(cache_, block); + block = left_buddy; + } + } + + // Dumping this block into pool + VLOG(3) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; + pool_.insert( + IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); + + // Clean up if existing too much free memory + + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); + + // Free normal allocation + CleanIdleNormalAlloc(); +} + +size_t BuddyAllocator::Used() { return total_used_; } + +void* BuddyAllocator::SystemAlloc(size_t size) { + size_t index = 0; + void* p = system_allocator_->Alloc(index, size); + + VLOG(3) << "Allocated " << p << " from system allocator."; + + if (p == nullptr) return nullptr; + + static_cast(p)->init(cache_, 
MemoryBlock::HUGE_CHUNK, index, + size, nullptr, nullptr); + + return static_cast(p)->data(); +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +#ifndef PADDLE_ONLY_CPU + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the maximum allocation size for the first allocation. + max_chunk_size_ = platform::GpuMaxChunkSize(); + } + } +#endif // PADDLE_ONLY_CPU + + // Allocate a new maximum sized block + size_t index = 0; + void* p = system_allocator_->Alloc(index, max_chunk_size_); + + if (p == nullptr) return pool_.end(); + + VLOG(3) << "Creating and inserting new block " << p + << " from system allocator"; + + static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, + max_chunk_size_, nullptr, nullptr); + + // gpu fallback allocation + if (system_allocator_->UseGpu() && + static_cast(p)->index(cache_) == 1) { + fallback_alloc_count_++; + } + + total_free_ += max_chunk_size_; + + // dump the block into pool + return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { + size_t index = 0; + + while (1) { + auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr)); + + // no match chunk memory + if (it == pool_.end()) return it; + + if (std::get<0>(*it) > index) { + // find suitable one + if (std::get<1>(*it) >= size) { + return it; + } + // update and continue + index = std::get<0>(*it); + continue; + } + return it; + } +} + +void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, + size_t size) { + auto block = static_cast(std::get<2>(*it)); + pool_.erase(it); + + VLOG(3) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; + block->split(cache_, size); + + VLOG(3) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; + block->set_type(cache_, MemoryBlock::ARENA_CHUNK); + + // the rest of memory if exist + if (block->has_right_buddy(cache_)) { + if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { + VLOG(3) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; + + pool_.insert( + IndexSizeAddress(block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_))); + } + } + + return block; +} + +void BuddyAllocator::CleanIdleFallBackAlloc() { + // If fallback allocation does not exist, return directly + if (!fallback_alloc_count_) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + // If no GPU fallback allocator, return + if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { + return; + } + + VLOG(3) << "Return block " << block << " to fallback allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + fallback_alloc_count_--; + + // If no fall allocation exists, return directly + if (!fallback_alloc_count_) return; + } +} + +void BuddyAllocator::CleanIdleNormalAlloc() { + auto shall_free_alloc = [&]() -> bool { + // free all fallback allocations + if (fallback_alloc_count_ > 0) { + return true; + } + // keep 2x overhead if we haven't 
fallen back + if ((total_used_ + max_chunk_size_) * 2 < total_free_) { + return true; + } + return false; + }; + + if (!shall_free_alloc()) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + VLOG(3) << "Return block " << block << " to base allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + + if (!shall_free_alloc()) return; + } } } // namespace detail diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 82e6aaedc7..9c41378483 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,9 +14,16 @@ #pragma once +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/meta_data.h" #include "paddle/memory/detail/system_allocator.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" #include +#include +#include #include namespace paddle { @@ -25,61 +32,80 @@ namespace detail { class BuddyAllocator { public: - BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator); + BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, + size_t max_chunk_size); + ~BuddyAllocator(); - void* Alloc(size_t size); - void Free(void*); + public: + void* Alloc(size_t unaligned_size); + void Free(void* ptr); size_t Used(); + public: + // Disable copy and assignment + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + private: - struct Block { - size_t size_; - Block* left_; // left buddy - Block* right_; // right buddy - }; + // Tuple (allocator index, memory size, memory address) + using IndexSizeAddress = std::tuple; + // Each element in PoolSet is a free allocation + using PoolSet = std::set; - // Initially, there is only one pool. If a Alloc founds not enough - // memory from that pool, and there has not been max_num_pools_, - // create a new pool by calling system_allocator_.Alloc(pool_size_). - std::vector pools_; + /*! \brief Allocate fixed-size memory from system */ + void* SystemAlloc(size_t size); - size_t pool_size_; // the size of each pool; - size_t max_num_pools_; // the size of all pools; + /*! \brief If existing chunks are not suitable, refill pool */ + PoolSet::iterator RefillPool(); - SystemAllocator* system_allocator_; + /** + * \brief Find the suitable chunk from existing pool and split + * it to left and right buddies + * + * \param it the iterator of pool list + * \param size the size of allocation + * + * \return the left buddy address + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); - std::mutex mutex_; + /*! \brief Find the existing chunk which used to allocation */ + PoolSet::iterator FindExistChunk(size_t size); - // Disable copy and assignment. - BuddyAllocator(const BuddyAllocator&) = delete; - BuddyAllocator& operator=(const BuddyAllocator&) = delete; -}; + /*! \brief Clean idle fallback allocation */ + void CleanIdleFallBackAlloc(); + + /*! 
\brief Clean idle normal allocation */ + void CleanIdleNormalAlloc(); -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = nullptr; - if (a == nullptr) { - a = new BuddyAllocator(); - } - return a; -} - -#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. - -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; - if (as == NULL) { - int gpu_num = platform::GetDeviceCount(); - as = new BuddyAllocator*[gpu_num]; - for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = new BuddyAllocator(); - } - } - return as[gpu_id]; -} - -#endif // PADDLE_ONLY_CPU + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + private: + /** + * \brief A list of free allocation + * + * \note Only store free chunk memory in pool + */ + PoolSet pool_; + + /*! Record fallback allocation count for auto-scaling */ + size_t fallback_alloc_count_ = 0; + + private: + /*! Unify the metadata format between GPU and CPU allocations */ + MetadataCache cache_; + + private: + /*! Allocate CPU/GPU memory from system */ + SystemAllocator* system_allocator_; + std::mutex mutex_; +}; } // namespace detail } // namespace memory diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc new file mode 100644 index 0000000000..fc40993208 --- /dev/null +++ b/paddle/memory/detail/memory_block.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/meta_data.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy) { + cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, + static_cast(left_buddy), + static_cast(right_buddy))); +} + +MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { + return cache.load(this).type; +} + +size_t MemoryBlock::size(MetadataCache& cache) const { + return cache.load(this).size; +} + +size_t MemoryBlock::total_size(MetadataCache& cache) const { + return cache.load(this).total_size; +} + +MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { + return cache.load(this).left_buddy; +} + +MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { + return cache.load(this).right_buddy; +} + +void MemoryBlock::split(MetadataCache& cache, size_t size) { + // make sure the split fits + PADDLE_ASSERT(total_size(cache) >= size); + + // bail out if there is no room for another partition + if (total_size(cache) - size <= sizeof(Metadata)) { + return; + } + + // find the position of the split + void* right_partition = reinterpret_cast(this) + size; + + size_t remaining_size = total_size(cache) - size; + + // Add the new block as a buddy + auto metadata = cache.load(this); + + // Write the metadata for the new block + auto new_block_right_buddy = metadata.right_buddy; + + cache.store( + static_cast(right_partition), + Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), + remaining_size, this, new_block_right_buddy)); + + metadata.right_buddy = static_cast(right_partition); + metadata.size = size - sizeof(Metadata); + metadata.total_size = size; + + cache.store(this, metadata); + + // Write metadata for the new block's right buddy + if (new_block_right_buddy != nullptr) { + auto buddy_metadata = cache.load(new_block_right_buddy); + + buddy_metadata.left_buddy = static_cast(right_partition); + + cache.store(new_block_right_buddy, buddy_metadata); + } +} + +void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { + // only free blocks can be merged + PADDLE_ASSERT(type(cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); + + auto metadata = cache.load(this); + + // link this->buddy's buddy + metadata.right_buddy = right_buddy->right_buddy(cache); + + // link buddy's buddy -> this + if (metadata.right_buddy != nullptr) { + auto buddy_metadata = cache.load(metadata.right_buddy); + + buddy_metadata.left_buddy = this; + + cache.store(metadata.right_buddy, buddy_metadata); + } + + metadata.size += right_buddy->total_size(cache); + metadata.total_size += right_buddy->total_size(cache); + + cache.store(this, metadata); + cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); +} + +void MemoryBlock::mark_as_free(MetadataCache& cache) { + // check for double free or corruption + PADDLE_ASSERT(type(cache) != FREE_CHUNK); + PADDLE_ASSERT(type(cache) != INVALID_CHUNK); + + set_type(cache, FREE_CHUNK); +} + +void MemoryBlock::set_type(MetadataCache& cache, Type t) { + auto metadata = cache.load(this); + + metadata.type = t; + + cache.store(this, metadata); +} + +bool MemoryBlock::has_left_buddy(MetadataCache& cache) const { + return left_buddy(cache) != nullptr; +} + +bool 
MemoryBlock::has_right_buddy(MetadataCache& cache) const { + return right_buddy(cache) != nullptr; +} + +size_t MemoryBlock::index(MetadataCache& cache) const { + return cache.load(this).index; +} + +void* MemoryBlock::data() const { + return const_cast(reinterpret_cast(this)) + 1; +} + +MemoryBlock* MemoryBlock::metadata() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) - 1)); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h new file mode 100644 index 0000000000..a5168b519f --- /dev/null +++ b/paddle/memory/detail/memory_block.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace memory { +namespace detail { + +// Forward Declarations +class MetadataCache; + +/*! \brief A class used to interpret the contents of a memory block */ +class MemoryBlock { + public: + enum Type { + FREE_CHUNK, // memory is free and idle + ARENA_CHUNK, // memory is being occupied + HUGE_CHUNK, // memory is out of management + INVALID_CHUNK // memory is invalid + }; + + public: + void init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy); + + public: + /*! \brief The type of the allocation */ + Type type(MetadataCache& cache) const; + + /*! \brief The size of the data region */ + size_t size(MetadataCache& cache) const; + + /*! \brief An index to track the allocator */ + size_t index(MetadataCache& cache) const; + + /*! \brief The total size of the block */ + size_t total_size(MetadataCache& cache) const; + + /*! \brief Check the left buddy of the block */ + bool has_left_buddy(MetadataCache& cache) const; + + /*! \brief Check the right buddy of the block */ + bool has_right_buddy(MetadataCache& cache) const; + + /*! \brief Get the left buddy */ + MemoryBlock* left_buddy(MetadataCache& cache) const; + + /*! \brief Get the right buddy */ + MemoryBlock* right_buddy(MetadataCache& cache) const; + + public: + /*! \brief Split the allocation into left/right blocks */ + void split(MetadataCache& cache, size_t size); + + /*! \brief Merge left and right blocks together */ + void merge(MetadataCache& cache, MemoryBlock* right_buddy); + + /*! \brief Mark the allocation as free */ + void mark_as_free(MetadataCache& cache); + + /*! \brief Change the type of the allocation */ + void set_type(MetadataCache& cache, Type t); + + public: + /*! \brief Get a pointer to the memory block's data */ + void* data() const; + + /*! 
\brief Get a pointer to the memory block's metadata */ + MemoryBlock* metadata() const; + + public: + static size_t overhead(); +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc new file mode 100644 index 0000000000..30ff80e7ba --- /dev/null +++ b/paddle/memory/detail/meta_cache.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} + +Metadata MetadataCache::load(const MemoryBlock* block) { + if (uses_gpu_) { + auto existing_metadata = cache_.find(block); + PADDLE_ASSERT(existing_metadata->second.check_guards()); + return existing_metadata->second; + } else { + PADDLE_ASSERT(reinterpret_cast(block)->check_guards()); + return *reinterpret_cast(block); + } +} + +void MetadataCache::store(MemoryBlock* block, + const Metadata& original_metadata) { + auto metadata = original_metadata; + + metadata.update_guards(); + + if (uses_gpu_) { + cache_[block] = metadata; + } else { + *reinterpret_cast(block) = metadata; + } +} + +void MetadataCache::invalidate(MemoryBlock* block) { + if (uses_gpu_) { + cache_.erase(block); + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h new file mode 100644 index 0000000000..cf58156442 --- /dev/null +++ b/paddle/memory/detail/meta_cache.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +/** + * \brief A cache for accessing memory block meta-data that may be expensive + * to access directly. + * + * \note This class exists to unify the metadata format between GPU and CPU + * allocations. It should be removed when the CPU can access all GPU + * allocations directly via UVM. + */ +class MetadataCache { + public: + explicit MetadataCache(bool uses_gpu); + + public: + /*! \brief Load the associated metadata for the specified memory block. 
*/ + Metadata load(const MemoryBlock* memory_block); + + /*! \brief Store the associated metadata for the specified memory block. */ + void store(MemoryBlock* memory_block, const Metadata& meta_data); + + /*! \brief Indicate that the specified metadata will no longer be used. */ + void invalidate(MemoryBlock* memory_block); + + public: + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + private: + bool uses_gpu_; + + private: + typedef std::unordered_map MetadataMap; + + private: + MetadataMap cache_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc new file mode 100644 index 0000000000..70c5c1f439 --- /dev/null +++ b/paddle/memory/detail/meta_data.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, + MemoryBlock* l, MemoryBlock* r) + : type(t), + index(i), + size(s), + total_size(ts), + left_buddy(l), + right_buddy(r) {} + +Metadata::Metadata() + : type(MemoryBlock::INVALID_CHUNK), + index(0), + size(0), + total_size(0), + left_buddy(nullptr), + right_buddy(nullptr) {} + +template +inline void hash_combine(std::size_t& seed, const T& v) { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +inline size_t hash(const Metadata* metadata, size_t initial_seed) { + size_t seed = initial_seed; + + hash_combine(seed, (size_t)metadata->type); + hash_combine(seed, metadata->index); + hash_combine(seed, metadata->size); + hash_combine(seed, metadata->total_size); + hash_combine(seed, metadata->left_buddy); + hash_combine(seed, metadata->right_buddy); + + return seed; +} + +void Metadata::update_guards() { + guard_begin = hash(this, 1); + guard_end = hash(this, 2); +} + +bool Metadata::check_guards() const { + return guard_begin == hash(this, 1) && guard_end == hash(this, 2); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h new file mode 100644 index 0000000000..628cf1f2e3 --- /dev/null +++ b/paddle/memory/detail/meta_data.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
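The guard fields in Metadata act as a lightweight integrity check: update_guards() hashes the payload fields with two different seeds and stores the results at both ends of the struct, and check_guards() recomputes them to catch stray writes into the header. A minimal standalone sketch of the same idea (GuardedHeader and hash_header are illustrative names, not the Paddle types, and only two payload fields are hashed here):

#include <cstddef>
#include <functional>
#include <iostream>

struct GuardedHeader {
  size_t guard_begin;
  size_t index;
  size_t size;
  size_t guard_end;
};

template <typename T>
inline void hash_combine(std::size_t& seed, const T& v) {
  seed ^= std::hash<T>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

size_t hash_header(const GuardedHeader& h, size_t seed) {
  hash_combine(seed, h.index);
  hash_combine(seed, h.size);
  return seed;
}

void update_guards(GuardedHeader& h) {
  h.guard_begin = hash_header(h, 1);
  h.guard_end = hash_header(h, 2);
}

bool check_guards(const GuardedHeader& h) {
  return h.guard_begin == hash_header(h, 1) && h.guard_end == hash_header(h, 2);
}

int main() {
  GuardedHeader h{0, 3, 4096, 0};
  update_guards(h);
  std::cout << check_guards(h) << "\n";  // 1: header is intact
  h.size = 1;                            // simulate a stray write into the header
  std::cout << check_guards(h) << "\n";  // 0: corruption detected
}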
*/ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +class Metadata { + public: + Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, + MemoryBlock* r); + Metadata(); + + public: + /*! \brief Update the guards when metadata is changed */ + void update_guards(); + + /*! \brief Check consistency to previous modification */ + bool check_guards() const; + + public: + // TODO(gangliao): compress this + // clang-format off + size_t guard_begin = 0; + MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; + size_t index = 0; + size_t size = 0; + size_t total_size = 0; + MemoryBlock* left_buddy = nullptr; + MemoryBlock* right_buddy = nullptr; + size_t guard_end = 0; + // clang-format on +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 50bec926f8..a270bd5958 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -13,76 +13,127 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/detail/system_allocator.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/gpu_info.h" #include // for malloc and free #include // for mlock and munlock #include "gflags/gflags.h" -#include "paddle/platform/assert.h" -#include "paddle/platform/cuda.h" // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the amount // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. -DEFINE_bool(use_pinned_memory, false, - "If set, allocate cpu/gpu pinned memory."); +DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); namespace paddle { namespace memory { namespace detail { -void* CPUAllocator::Alloc(size_t size) { +void* CPUAllocator::Alloc(size_t& index, size_t size) { // According to http://www.cplusplus.com/reference/cstdlib/malloc/, // malloc might not return nullptr if size is zero, but the returned // pointer shall not be dereferenced -- so we make it nullptr. if (size <= 0) return nullptr; + index = 0; // unlock memory + void* p = malloc(size); - if (p != nullptr && FLAGS_use_pinned_memory) { - mlock(p, size); + + if (p != nullptr) { + if (FLAGS_use_pinned_memory) { + index = 1; + mlock(p, size); // lock memory + } } + return p; } -void CPUAllocator::Free(void* p, size_t size) { - if (p != nullptr && FLAGS_use_pinned_memory) { +void CPUAllocator::Free(void* p, size_t size, size_t index) { + if (p != nullptr && index == 1) { munlock(p, size); } free(p); } +bool CPUAllocator::UseGpu() const { return false; } + #ifndef PADDLE_ONLY_CPU -void* GPUAllocator::Alloc(size_t size) { +void* GPUAllocator::Alloc(size_t& index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr // if size is 0. We just make sure it does. - if (size <= 0) { - return nullptr; - } + if (size <= 0) return nullptr; + size_t available = 0; + size_t capacity = 0; + paddle::platform::GpuMemoryUsage(available, capacity); + + // Reserve memory for page tables, etc. + size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); + size_t usable = available > reserving ? 
available - reserving : 0; + + // If remaining size no less than expected size, using general + // cudaMalloc to allocate GPU memory. void* p = 0; - cudaError_t result = - FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); - if (result != cudaSuccess) { - cudaGetLastError(); // clear error if there is any. + if (size <= usable) { + cudaError_t result = cudaMalloc(&p, size); + if (result == cudaSuccess) { + index = 0; + gpu_alloc_size_ += size; + return p; + } + } + + // If remaining size less than expected size or cudaMalloc failed, + // cudaMallocHost will be considered as a fallback allocator. + // + // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size + // of host fallback allocation. Allocates too much would reduce + // the amount of memory available to the underlying system for paging. + usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; + + if (size > usable) return nullptr; + + cudaError_t result = cudaMallocHost(&p, size); + if (result == cudaSuccess) { + index = 1; + fallback_alloc_size_ += size; + return p; } - return result == cudaSuccess ? p : nullptr; + + return nullptr; } -void GPUAllocator::Free(void* p, size_t size) { +void GPUAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = cudaFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + } + // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if // cudaFree succeeds. - cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p); if (err != cudaErrorCudartUnloading) { - platform::throw_on_error(err, "cudaFree{Host} failed"); + PADDLE_ENFORCE(err, "cudaFree{Host} failed in GPUAllocator::Free."); } } +bool GPUAllocator::UseGpu() const { return true; } + #endif // PADDLE_ONLY_CPU } // namespace detail diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 184b383f7f..82ba322e05 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -20,31 +20,36 @@ namespace paddle { namespace memory { namespace detail { -// SystemAllocator is the parent class of CPUAllocator and -// GPUAllocator. A BuddyAllocator object uses a SystemAllocator* -// pointing to the underlying system allocator. An alternative to -// this class hierarchy is to pass a system allocator class to -// BuddyAllocator as a template parameter. This approach makes -// BuddyAllocator a class template, and it's very complicated -// algorithm would make the buddy_allocator.h messy. +/** + * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator. + * A BuddyAllocator object uses a SystemAllocator* pointing to the + * underlying system allocator. 
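GPUAllocator::Alloc above is two-tiered: it first tries cudaMalloc against the memory the device can actually spare (current availability minus a reservation derived from GpuMaxAllocSize(), kept back for page tables and the like), and only then falls back to pinned host memory via cudaMallocHost, capped so the fallback pool never exceeds GpuMaxAllocSize(). A standalone sketch of just that decision, with made-up numbers standing in for the CUDA queries:

#include <cstddef>
#include <iostream>

enum class Tier { kDevice, kHostFallback, kFail };

// available/capacity stand in for platform::GpuMemoryUsage(), and
// max_alloc for platform::GpuMaxAllocSize(); all values are in bytes.
Tier ChooseTier(size_t size, size_t available, size_t capacity,
                size_t max_alloc, size_t fallback_used) {
  size_t reserving = capacity - max_alloc;  // held back for page tables etc.
  size_t usable = available > reserving ? available - reserving : 0;
  if (size <= usable) return Tier::kDevice;  // plain cudaMalloc path

  size_t host_budget = max_alloc - fallback_used;  // cudaMallocHost budget left
  return size <= host_budget ? Tier::kHostFallback : Tier::kFail;
}

int main() {
  // 16 GB card, 1 GB currently free, 15 GB max single allocation, no fallback yet:
  // a 2 GB request cannot fit on the device and goes to the pinned-host fallback.
  Tier t = ChooseTier(2UL << 30, 1UL << 30, 16UL << 30, 15UL << 30, 0);
  std::cout << (t == Tier::kHostFallback) << "\n";  // 1
}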
+ */ class SystemAllocator { public: virtual ~SystemAllocator() {} - virtual void* Alloc(size_t size) = 0; - virtual void Free(void* p, size_t size) = 0; + virtual void* Alloc(size_t& index, size_t size) = 0; + virtual void Free(void* p, size_t size, size_t index) = 0; + virtual bool UseGpu() const = 0; }; class CPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; }; #ifndef PADDLE_ONLY_CPU class GPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t gpu_alloc_size_ = 0; + size_t fallback_alloc_size_ = 0; }; #endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc index 9bd5706a4e..ba44e06ddb 100644 --- a/paddle/memory/detail/system_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory); void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { bool freed = false; { - void* p = a.Alloc(size); + size_t index; + void* p = a.Alloc(index, size); if (size > 0) { EXPECT_NE(p, nullptr); } else { @@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { int* i = static_cast(p); std::shared_ptr ptr(i, [&](void* p) { freed = true; - a.Free(p, size); + a.Free(p, size, index); }); } EXPECT_TRUE(freed); @@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) { } #ifndef PADDLE_ONLY_CPU -TEST(GPUAllocator, NoStaging) { - FLAGS_use_pinned_memory = false; - paddle::memory::detail::GPUAllocator a; - TestAllocator(a, 2048); - TestAllocator(a, 0); -} -TEST(GPUAllocator, Staging) { - FLAGS_use_pinned_memory = true; +TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a; TestAllocator(a, 2048); TestAllocator(a, 0); diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc new file mode 100644 index 0000000000..a19a3e3675 --- /dev/null +++ b/paddle/memory/memcpy.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
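Under the revised interface, Alloc reports an index describing how the block was obtained (for CPUAllocator, 1 means the pages were pinned with mlock; for GPUAllocator, 1 means the cudaMallocHost fallback), and the caller must hand the same index back to Free. A rough usage sketch against the declarations above; Example is illustrative and assumes it is compiled inside the Paddle tree:

#include "paddle/memory/detail/system_allocator.h"

#include <cstring>

void Example() {
  paddle::memory::detail::CPUAllocator cpu;

  size_t index = 0;                     // filled in by Alloc
  void* p = cpu.Alloc(index, 1 << 20);  // 1 MB; index is 1 when the pages get pinned
  if (p != nullptr) {
    std::memset(p, 0, 1 << 20);
    cpu.Free(p, 1 << 20, index);        // the index must match the allocation
  }
}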
*/ + +#include "paddle/memory/memcpy.h" + +#include // for memcpy + +namespace paddle { +namespace memory { + +template <> +void Copy(platform::CPUPlace, void* dst, + platform::CPUPlace, + const void* src, size_t num) { + std::memcpy(dst, src, num); +} + +#ifndef PADDLE_ONLY_CPU +template <> +void Copy(platform::CPUPlace dst_place, + void* dst, + platform::GPUPlace src_place, + const void* src, size_t num, + cudaStream_t stream) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); +} + +template <> +void Copy(platform::GPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, size_t num, + cudaStream_t stream) { + platform::SetDeviceId(dst_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); +} + +template <> +void Copy(platform::GPUPlace dst_place, + void* dst, + platform::GPUPlace src_place, + const void* src, size_t num, + cudaStream_t stream) { + if (dst_place == src_place) { + platform::SetDeviceId(src_place.device); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + } else { + platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, + stream); + } +} + +#endif // PADDLE_ONLY_CPU + +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h new file mode 100644 index 0000000000..2b9c0eada6 --- /dev/null +++ b/paddle/memory/memcpy.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/platform/gpu_info.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace memory { + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); + +#ifndef PADDLE_ONLY_CPU + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or GPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or GPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream CUDA stream. + * + * \note For GPU memory copy, CUDA stream need to be specified + * for asynchronously memory copy. 
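Taken together, these specializations give one Copy entry point whose behavior is selected by the (DstPlace, SrcPlace) pair; GPU-involved copies also take a cudaStream_t and run asynchronously on that stream. A hedged sketch of a host-to-device copy; CopyVectorToGpu is illustrative, assumes a CUDA build of the tree, and uses the GPU overload declared just below:

#include "paddle/memory/memcpy.h"
#include "paddle/platform/place.h"

#include <cuda_runtime.h>
#include <vector>

// device_dst is assumed to be a device buffer with room for host.size() floats.
void CopyVectorToGpu(const std::vector<float>& host, void* device_dst,
                     cudaStream_t stream) {
  paddle::platform::CPUPlace cpu;
  paddle::platform::GPUPlace gpu(0);

  // Asynchronous host-to-device copy, queued on the caller's stream.
  paddle::memory::Copy(gpu, device_dst, cpu, host.data(),
                       host.size() * sizeof(float), stream);

  // The copy is only finished once the stream has been synchronized.
  cudaStreamSynchronize(stream);
}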
+ * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + cudaStream_t stream); + +#endif // PADDLE_ONLY_CPU + +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 0d123d99e2..29bc26f9d3 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -13,47 +13,105 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/memory.h" + +#include // for transform +#include // for memcpy +#include // for unique_ptr +#include // for call_once + +#include "glog/logging.h" + #include "paddle/memory/detail/buddy_allocator.h" #include "paddle/memory/detail/system_allocator.h" -#include "paddle/platform/assert.h" +#include "paddle/platform/gpu_info.h" -#include +DECLARE_double(fraction_of_gpu_memory_to_use); namespace paddle { namespace memory { -void* Alloc(platform::Place pl, size_t size) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size); - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Alloc(size); +using BuddyAllocator = detail::BuddyAllocator; + +std::once_flag cpu_allocator_flag; +std::once_flag gpu_allocator_flag; + +BuddyAllocator* GetCPUBuddyAllocator() { + static std::unique_ptr a{nullptr}; + + std::call_once(cpu_allocator_flag, [&]() { + a.reset(new BuddyAllocator(new detail::CPUAllocator, + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize())); + }); + + return a.get(); } -void Free(paddle::platform::Place pl, void* p) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - detail::GetGPUBuddyAllocator(gpu_id)->Free(p); - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - detail::GetCPUBuddyAllocator()->Free(p); +template <> +void* Alloc(platform::CPUPlace place, size_t size) { + return GetCPUBuddyAllocator()->Alloc(size); +} + +template <> +void Free(platform::CPUPlace place, void* p) { + GetCPUBuddyAllocator()->Free(p); +} + +template <> +size_t Used(platform::CPUPlace place) { + return GetCPUBuddyAllocator()->Used(); } -size_t Used(paddle::platform::Place pl) { #ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Used(); - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Used(); + +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + using BuddyAllocVec = std::vector; + static std::unique_ptr as{ + new BuddyAllocVec, [](BuddyAllocVec* p) { + std::for_each(p->begin(), p->end(), + [](BuddyAllocator* p) { delete p; }); + }}; + + // GPU buddy allocators + auto& allocators = *as.get(); + + // GPU buddy allocator initialization + std::call_once(gpu_allocator_flag, [&]() { + int gpu_num = platform::GetDeviceCount(); + allocators.reserve(gpu_num); + for (int gpu = 0; gpu < gpu_num; gpu++) { + platform::SetDeviceId(gpu); + allocators.emplace_back(new BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize())); + } + VLOG(3) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 << "% of GPU memory.\n" + << "You can set environment variable '" + << platform::kEnvFractionGpuMemoryToUse + << 
"' to change the fraction of GPU usage.\n\n"; + }); + + platform::SetDeviceId(gpu_id); + return allocators[gpu_id]; +} + +template <> +void* Alloc(platform::GPUPlace place, size_t size) { + return GetGPUBuddyAllocator(place.device)->Alloc(size); +} + +template <> +void Free(platform::GPUPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); } +template <> +size_t Used(platform::GPUPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); +} + +#endif // PADDLE_ONLY_CPU + } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index a33092bade..11bbb88187 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -19,9 +19,60 @@ limitations under the License. */ namespace paddle { namespace memory { -void* Alloc(paddle::platform::Place, size_t); -void Free(paddle::platform::Place, void*); -size_t Used(paddle::platform::Place); +/** + * \brief Allocate memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] size Allocation size. + * + * \return Allocated memory block address. + * + * \note If return nullptr, it indicates memory allocation failed + * because insufficient memory in current system. When Alloc + * function is invoked, you must check the returned memory + * address is valid or not. + */ +template +void* Alloc(Place place, size_t size); + +/** + * \brief Free memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] ptr Memory block address to free. + * + */ +template +void Free(Place place, void* ptr); + +/** + * \brief Total size of used memory in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * + */ +template +size_t Used(Place place); + +/** + * \brief Free memory block in one place. + * + * \note In some cases, custom deleter is used to + * deallocate the memory automatically for + * std::unique_ptr in tensor.h. + * + */ +template +class PODDeleter { + static_assert(std::is_pod::value, "T must be POD"); + + public: + explicit PODDeleter(Place place) : place_(place) {} + void operator()(T* ptr) { Free(place_, static_cast(ptr)); } + + private: + Place place_; +}; } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc new file mode 100644 index 0000000000..53cc63a098 --- /dev/null +++ b/paddle/memory/memory_test.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/memory/memory.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" +#include "paddle/platform/place.h" + +#include +#include + +inline bool is_aligned(void const *p) { + return 0 == (reinterpret_cast(p) & 0x3); +} + +size_t align(size_t size, paddle::platform::CPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::CpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, CPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CPUPlace cpu; + p = paddle::memory::Alloc(cpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(cpu, p); +} + +TEST(BuddyAllocator, CPUMultAlloc) { + paddle::platform::CPUPlace cpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(cpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(cpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(size, cpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(cpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(p.second, cpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } +} + +#ifndef PADDLE_ONLY_CPU + +size_t align(size_t size, paddle::platform::GPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, GPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::GPUPlace gpu(0); + p = paddle::memory::Alloc(gpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(gpu, p); +} + +TEST(BuddyAllocator, GPUMultAlloc) { + paddle::platform::GPUPlace gpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(gpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(gpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(size, gpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(gpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(p.second, gpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } +} + +#endif // PADDLE_ONLY_CPU diff --git a/paddle/operators/.clang-format b/paddle/operators/.clang-format new file mode 100644 index 0000000000..47b8a85206 --- /dev/null +++ b/paddle/operators/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... 
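The align() helpers in the test mirror the allocator's accounting: the requested size is grown by the metadata header and then rounded up to the next multiple of the minimum chunk size, which is why Used() moves in aligned steps. A standalone sketch of that rounding; the 32-byte header and 256-byte minimum chunk are made-up numbers, not the values returned by sizeof(Metadata) or CpuMinChunkSize():

#include <cstddef>
#include <iostream>

// Illustrative constants only; the real values come from sizeof(Metadata)
// and platform::CpuMinChunkSize() / GpuMinChunkSize().
constexpr size_t kHeaderBytes = 32;
constexpr size_t kMinChunk = 256;

size_t AlignedRequest(size_t size) {
  size += kHeaderBytes;  // room for the block header
  size_t remaining = size % kMinChunk;
  return remaining == 0 ? size : size + (kMinChunk - remaining);
}

int main() {
  std::cout << AlignedRequest(128) << "\n";   // 160 rounds up to 256
  std::cout << AlignedRequest(224) << "\n";   // 256 is already aligned
  std::cout << AlignedRequest(4096) << "\n";  // 4128 rounds up to 4352
}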
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt new file mode 100644 index 0000000000..a7c89787e4 --- /dev/null +++ b/paddle/operators/CMakeLists.txt @@ -0,0 +1,70 @@ +function(op_library TARGET) + # op_library is a function to create op library. The interface is same as + # cc_library. But it handle split GPU/CPU code and link some common library + # for ops. + set(cc_srcs) + set(cu_srcs) + set(op_common_deps operator op_registry) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + foreach(src ${op_library_SRCS}) + if (${src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*\\.cc$") + list(APPEND cc_srcs ${src}) + else() + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + endif() + endforeach() + + list(LENGTH cc_srcs cc_srcs_len) + if (${cc_srcs_len} EQUAL 0) + message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") + endif() + + list(LENGTH cu_srcs cu_srcs_len) + list(LENGTH op_library_DEPS dep_len) + if (${cu_srcs_len} EQUAL 0 AND ${dep_len} EQUAL 0) + message(WARNING "The op library ${TARGET} not support GPU!") + endif() + + if (WITH_GPU) + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + else() + cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + endif() +endfunction() + +add_subdirectory(math) +cc_test(gather_test SRCS gather_test.cc DEPS tensor) + +cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) + +cc_library(net_op SRCS net_op.cc DEPS op_registry) +cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) + +op_library(add_op SRCS add_op.cc add_op.cu) + +op_library(mean_op SRCS mean_op.cc mean_op.cu) + +op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) +op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc) + +op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu) +op_library(softmax_op SRCS softmax_op.cc softmax_op.cu) +op_library(gaussian_random_op SRCS gaussian_random_op.cc gaussian_random_op.cu) +op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu) +op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu) + +op_library(sgd_op SRCS sgd_op.cc sgd_op.cu) + +op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc + DEPS framework_proto tensor op_registry operator net_op) +op_library(uniform_random_op + SRCS uniform_random_op.cc uniform_random_op.cu) diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc new file mode 100644 index 0000000000..8ab748ed71 --- /dev/null +++ b/paddle/operators/add_op.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/add_op.h" + +namespace paddle { +namespace operators { + +class AddOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), + ctx.Input("Y")->dims(), + "Two input of Add Op's dimension must be same."); + ctx.Output("Out")->Resize(ctx.Input("X")->dims()); + } +}; + +class AddOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of add op"); + AddInput("Y", "The second input of add op"); + AddOutput("Out", "The output of add op"); + AddComment(R"DOC( +Two Element Add Operator. + +The equation is: Out = X + Y +)DOC"); + } +}; + +class AddOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker, add_two_grad, ops::AddOpGrad); + +REGISTER_OP_CPU_KERNEL(add_two, + ops::AddKernel); diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu new file mode 100644 index 0000000000..cec5f558cb --- /dev/null +++ b/paddle/operators/add_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/framework/op_registry.h" +#include "paddle/operators/add_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(add_two, + ops::AddKernel); diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h new file mode 100644 index 0000000000..a7307b6818 --- /dev/null +++ b/paddle/operators/add_op.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class AddKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Y"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto X = EigenVector::Flatten(*input0); + auto Y = EigenVector::Flatten(*input1); + auto Z = EigenVector::Flatten(*output); + + auto place = context.GetEigenDevice(); + + Z.device(place) = X + Y; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc new file mode 100644 index 0000000000..ab1e1c101a --- /dev/null +++ b/paddle/operators/cross_entropy_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class OnehotCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto *X = ctx.Input("X"); + auto *label = ctx.Input("label"); + + PADDLE_ENFORCE_EQ(X->dims().size(), 2, "X's dimension must be 2."); + PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label's dimension must be 1."); + PADDLE_ENFORCE_EQ(X->dims()[0], label->dims()[0]); + ctx.Output("Y")->Resize({X->dims()[0]}); + } +}; + +class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto dX = ctx.Output(framework::GradVarName("X")); + auto X = ctx.Input("X"); + + dX->Resize(X->dims()); + } +}; + +class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + OnehotCrossEntropyOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of OnehotCrossEntropyOp"); + AddInput("label", "The second input of OnehotCrossEntropyOp"); + AddOutput("Y", "The output of OnehotCrossEntropyOp"); + AddComment(R"DOC( +OnehotCrossEntropy Operator. 
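AddKernel leans on Eigen's device-placed expressions: the inputs and output are flattened to rank-1 views, and the assignment through .device() decides where the elementwise expression is evaluated (a CPU device, or a CUDA device for the GPU place). A standalone sketch of the same pattern with plain Eigen tensors standing in for the framework's EigenVector views (assumes Eigen's unsupported Tensor module is on the include path):

#include <unsupported/Eigen/CXX11/Tensor>

#include <iostream>

int main() {
  // Stand-ins for the flattened input/output views used by AddKernel.
  Eigen::Tensor<float, 1> x(4), y(4), z(4);
  x.setValues({1.f, 2.f, 3.f, 4.f});
  y.setValues({10.f, 20.f, 30.f, 40.f});

  Eigen::DefaultDevice cpu;  // plays the role of context.GetEigenDevice()
  z.device(cpu) = x + y;     // the elementwise add is evaluated on that device

  std::cout << z(0) << " " << z(3) << "\n";  // 11 44
}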
+ + Y[i] = -log(X[i][j]) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp, + ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad, + ops::OnehotCrossEntropyGradientOp); +REGISTER_OP_CPU_KERNEL(onehot_cross_entropy, + ops::OnehotCrossEntropyOpKernel); +REGISTER_OP_CPU_KERNEL(onehot_cross_entropy_grad, + ops::OnehotCrossEntropyGradientOpKernel); diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu new file mode 100644 index 0000000000..d999bfce58 --- /dev/null +++ b/paddle/operators/cross_entropy_op.cu @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/op_registry.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +__host__ __device__ T clipping_log(const T x) { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + T v = log(x); + if (v == INFINITY) { + return kApproInf; + } + if (v == -INFINITY) { + return -kApproInf; + } + return v; +} + +template +__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, + const int N, const int D) { + // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file. + // CUDA_1D_KERNEL_LOOP(i, N) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + PADDLE_ASSERT(label[i] >= 0 && label[i] < D); + Y[i] = -clipping_log(X[i * D + label[i]]); + } +} + +// TODO(qingqing): make zero setting an common function. +template +__global__ void zero(T* X, const int N) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + X[i] = 0.0; + } +} + +template +__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X, + const int* label, const int N, + const int D) { + // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file. + // CUDA_1D_KERNEL_LOOP(i, N) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + int idx = i * D + label[i]; + dX[idx] = -dY[i] / X[idx]; + } +} + +template +class OnehotCrossEntropyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use GPUPlace."); + + auto X = ctx.Input("X"); + const T* Xdata = X->data(); + const int* label_data = ctx.Input("label")->data(); + auto Y = ctx.Output("Y"); + Y->mutable_data(ctx.GetPlace()); + T* Ydata = Y->data(); + + int N = X->dims()[0]; + int D = X->dims()[1]; + int block = 512; + int grid = (N + block - 1) / block; + // TODO(qingqing) launch kernel on specified stream + // base on ExecutionContext. 
+ CrossEntropyKernel<<>>(Ydata, Xdata, label_data, N, D); + } +}; + +template +class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use GPUPlace."); + + auto X = ctx.Input("X"); + auto dX = ctx.Output(framework::GradVarName("X")); + auto dY = ctx.Input(framework::GradVarName("Y")); + auto label = ctx.Input("label"); + + auto* dXdata = dX->template mutable_data(ctx.GetPlace()); + auto* dYdata = dY->template data(); + auto* Xdata = X->template data(); + auto* label_data = label->data(); + + int N = X->dims()[0]; + int D = X->dims()[1]; + int block = 512; + int grid = (N * D + block - 1) / block; + zero<<>>(dXdata, N * D); + + grid = (N + block - 1) / block; + // TODO(qingqing): launch kernel on specified stream + // base on ExecutionContext. + CrossEntropyGradientKernel<<>>(dXdata, dYdata, Xdata, + label_data, N, D); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(onehot_cross_entropy, + ops::OnehotCrossEntropyOpCUDAKernel); +REGISTER_OP_GPU_KERNEL(onehot_cross_entropy_grad, + ops::OnehotCrossEntropyGradientOpCUDAKernel); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h new file mode 100644 index 0000000000..eb4d1348de --- /dev/null +++ b/paddle/operators/cross_entropy_op.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
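Both CUDA kernels above use the grid-stride idiom: every thread starts at its global index and advances by blockDim.x * gridDim.x, so a launch of grid = (N + block - 1) / block blocks covers each element exactly once even when N is not a multiple of the block size. A host-side C++ sketch that simulates the thread loop sequentially to show the coverage property:

#include <iostream>
#include <vector>

int main() {
  const int N = 1000, block = 512;
  const int grid = (N + block - 1) / block;  // 2 blocks, 1024 "threads" in total
  const int stride = block * grid;

  std::vector<int> visits(N, 0);
  // Each (b, t) pair plays the role of one CUDA thread.
  for (int b = 0; b < grid; ++b) {
    for (int t = 0; t < block; ++t) {
      for (int i = b * block + t; i < N; i += stride) {
        visits[i] += 1;
      }
    }
  }

  bool exactly_once = true;
  for (int v : visits) exactly_once = exactly_once && (v == 1);
  std::cout << (exactly_once ? "every index visited once" : "bug") << "\n";
}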
*/ + +#pragma once +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +inline T tolerable_value(const T x) { + static_assert(std::is_floating_point::value, + "tolerable_value works only on float, " + "double and double double."); + + const T kApproInf = 1e20; + + if (x == INFINITY) { + return kApproInf; + } + + if (x == -INFINITY) { + return -kApproInf; + } + + return x; +} + +template +class OnehotCrossEntropyOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto X = ctx.Input("X"); + const T* Xdata = X->data(); + const int* label_data = ctx.Input("label")->data(); + auto Y = ctx.Output("Y"); + + Y->mutable_data(ctx.GetPlace()); + + T* Ydata = Y->data(); + + int batch_size = X->dims()[0]; + int class_num = X->dims()[1]; + + for (int i = 0; i < batch_size; ++i) { + int index = i * class_num + label_data[i]; + Ydata[i] = -tolerable_value(std::log(Xdata[index])); + } + } +}; + +template +class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto X = ctx.Input("X"); + auto dX = ctx.Output(framework::GradVarName("X")); + auto dY = ctx.Input(framework::GradVarName("Y")); + auto label = ctx.Input("label"); + + auto* dXdata = dX->template mutable_data(ctx.GetPlace()); + auto* dYdata = dY->template data(); + auto* Xdata = X->template data(); + auto* label_data = label->data(); + + const int batch_size = X->dims()[0]; + const int class_num = X->dims()[1]; + + // TODO(qingqing): make zero setting an common function. + memset(dXdata, 0, sizeof(T) * batch_size * class_num); + for (int i = 0; i < batch_size; ++i) { + int index = i * class_num + label_data[i]; + dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc new file mode 100644 index 0000000000..9d51f6e3a1 --- /dev/null +++ b/paddle/operators/fill_zeros_like_op.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
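In these CPU kernels the forward pass reads only the probability at the true label, and the backward pass writes a gradient only at that same position after zeroing everything else. A tiny worked example of both passes on made-up probabilities:

#include <cmath>
#include <cstring>
#include <iostream>

int main() {
  // 2 examples, 3 classes; rows are already probabilities (e.g. softmax output).
  const int batch = 2, classes = 3;
  const float X[6] = {0.2f, 0.7f, 0.1f,
                      0.5f, 0.25f, 0.25f};
  const int label[2] = {1, 0};

  float Y[2];  // per-example loss
  for (int i = 0; i < batch; ++i) {
    Y[i] = -std::log(X[i * classes + label[i]]);
  }
  std::cout << Y[0] << " " << Y[1] << "\n";  // -log(0.7)  -log(0.5)

  // Backward: only the column of the true label receives a nonzero gradient.
  const float dY[2] = {1.0f, 1.0f};
  float dX[6];
  std::memset(dX, 0, sizeof(dX));
  for (int i = 0; i < batch; ++i) {
    int idx = i * classes + label[i];
    dX[idx] = -dY[i] / X[idx];  // d(-log x)/dx = -1/x
  }
  std::cout << dX[1] << " " << dX[3] << "\n";  // -1/0.7  -1/0.5
}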
*/ + +#include "paddle/operators/fill_zeros_like_op.h" + +namespace paddle { +namespace operators { + +class FillZerosLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + ctx.Output("Dst")->Resize( + ctx.Input("Src")->dims()); + } +}; + +class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillZerosLikeOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Src", "The input of fill-zeros-like op."); + AddOutput("Dst", "The varibale will be filled up with zeros."); + AddComment(R"DOC( +Fill up a vriable with zeros. + +The output will have the same size with input. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, + ops::FillZerosLikeOpMaker); +REGISTER_OP_CPU_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu new file mode 100644 index 0000000000..fdbcf520a0 --- /dev/null +++ b/paddle/operators/fill_zeros_like_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/framework/op_registry.h" +#include "paddle/operators/fill_zeros_like_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h new file mode 100644 index 0000000000..fd380ca851 --- /dev/null +++ b/paddle/operators/fill_zeros_like_op.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class FillZerosLikeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* output = context.Output("Dst"); + output->mutable_data(context.GetPlace()); + auto t = framework::EigenVector::Flatten(*output); + t.device(context.GetEigenDevice()) = t.constant(T(0)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h new file mode 100644 index 0000000000..d6e6990394 --- /dev/null +++ b/paddle/operators/gather.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "paddle/framework/ddim.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace operators { + +// Implementation of CPU copy +template +void CPUGather(const T* params, const int* indices, const int slice_size, + const int index_size, T* output) { + const size_t slice_bytes = slice_size * sizeof(T); + + for (int i = 0; i < index_size; ++i) { + int index_ = indices[i]; + memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes); + } +} + +// Implementation of GPU copy: +template +void GPUGather(const T* src, const int* index, const int slice_size, + const int index_size, T* output); + +/** + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void Gather(const platform::Place& place, const paddle::framework::Tensor* src, + const paddle::framework::Tensor* index, + paddle::framework::Tensor* output) { + // check index of shape 1-D + PADDLE_ENFORCE(index->dims().size() == 1); + int index_size = index->dims()[0]; + + auto src_dims = src->dims(); + paddle::framework::DDim output_dims(src_dims); + output_dims[0] = index_size; + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + // Gathering + if (platform::is_cpu_place(place)) { + CPUGather(src->data(), index->data(), slice_size, index_size, + output->data()); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc new file mode 100644 index 0000000000..0ae1e99452 --- /dev/null +++ b/paddle/operators/gather_test.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
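Gather treats dimension 0 of the source as the row dimension: slice_size is the product of the remaining dimensions and each output row is a memcpy of the selected source row. A standalone sketch of that copy loop; GatherRows is just an illustrative rename of the CPU path:

#include <cstring>
#include <iostream>

// Standalone version of the row copy in CPUGather: output row i is a copy of
// params row indices[i]; slice_size is the number of elements per row.
template <typename T>
void GatherRows(const T* params, const int* indices, int slice_size,
                int index_size, T* output) {
  const size_t slice_bytes = slice_size * sizeof(T);
  for (int i = 0; i < index_size; ++i) {
    std::memcpy(output + i * slice_size, params + indices[i] * slice_size,
                slice_bytes);
  }
}

int main() {
  // A 3x4 source where row r holds {10r, 10r+1, 10r+2, 10r+3}.
  int src[12];
  for (int i = 0; i < 12; ++i) src[i] = (i / 4) * 10 + (i % 4);

  const int index[2] = {2, 0};  // pick row 2, then row 0
  int out[8];
  GatherRows(src, index, /*slice_size=*/4, /*index_size=*/2, out);

  std::cout << out[0] << " " << out[4] << "\n";  // 20 0
}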
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/gather.h" +#include "paddle/framework/ddim.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/place.h" + +#include +#include +#include + +TEST(Gather, GatherData) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators; + + Tensor* src = new Tensor(); + Tensor* index = new Tensor(); + Tensor* output = new Tensor(); + + int* p_src = nullptr; + int* p_index = nullptr; + p_src = src->mutable_data(make_ddim({3, 4}), CPUPlace()); + p_index = index->mutable_data(make_ddim({2}), CPUPlace()); + + for (int i = 0; i < 12; ++i) p_src[i] = i; + p_index[0] = 1; + p_index[1] = 0; + + int* p_output = output->mutable_data(make_ddim({2, 4}), CPUPlace()); + + Gather(CPUPlace(), src, index, output); + + for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); + for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4); + + delete src; + delete index; + delete output; +} diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc new file mode 100644 index 0000000000..a85363ad81 --- /dev/null +++ b/paddle/operators/gaussian_random_op.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CPUGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.op_.GetAttr("mean"); + float std = context.op_.GetAttr("std"); + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + + unsigned int seed = + static_cast(context.op_.GetAttr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::normal_distribution dist(mean, std); + ssize_t size = framework::product(tensor->dims()); + for (ssize_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + } +}; + +class GaussianRandomOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext& context) const override { + auto* tensor = context.Output("Out"); + auto dims = GetAttr>("dims"); + PADDLE_ENFORCE(dims.size() > 0UL, + "dims can be one int or array. 
dims must be set."); + tensor->Resize(framework::make_ddim(dims)); + } +}; + +class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { + public: + GaussianRandomOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "output matrix of random op"); + AddComment(R"DOC( +GaussianRandom operator. +Use to initialize tensor with gaussian random generator. +)DOC"); + + AddAttr>("dims", "The dimension of random tensor."); + AddAttr("mean", "mean of random tensor.").SetDefault(.0f); + AddAttr("std", "std of random tensor.").SetDefault(1.0f); + AddAttr("seed", + "Random seed of generator." + "0 means use system wide seed") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, + ops::GaussianRandomOpMaker); +REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel); diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu new file mode 100644 index 0000000000..018a4bfcb2 --- /dev/null +++ b/paddle/operators/gaussian_random_op.cu @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include +#include +#include +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct GaussianGenerator { + T mean_, std_; + unsigned int seed_; + + __host__ __device__ GaussianGenerator(T mean, T std, int seed) + : mean_(mean), std_(std), seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::normal_distribution dist(mean_, std_); + rng.discard(n); + return dist(rng); + } +}; + +template +class GPUGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = + static_cast(context.op_.GetAttr("seed")); + if (seed == 0) { + std::random_device rd; + seed = rd(); + } + T mean = static_cast(context.op_.GetAttr("mean")); + T std = static_cast(context.op_.GetAttr("std")); + thrust::counting_iterator index_sequence_begin(0); + ssize_t N = framework::product(tensor->dims()); + thrust::transform(index_sequence_begin, index_sequence_begin + N, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_GPU_KERNEL(gaussian_random, + paddle::operators::GPUGaussianRandomKernel); diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt new file mode 100644 index 0000000000..ed51d416ed --- /dev/null +++ b/paddle/operators/math/CMakeLists.txt @@ -0,0 +1,8 @@ + +if(WITH_GPU) + nv_library(math_function SRCS math_function.cc math_function.cu DEPS cblas device_context) +else() + cc_library(math_function SRCS math_function.cc DEPS cblas device_context) +endif() + +nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc new file mode 100644 index 0000000000..1e86fc3d16 --- /dev/null +++ b/paddle/operators/math/math_function.cc @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, + const int N, const int K, + const float alpha, const float* A, + const float* B, const float beta, float* C, + platform::DeviceContext* context) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, + const int N, const int K, + const double alpha, const double* A, + const double* B, const double beta, + double* C, + platform::DeviceContext* context) { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +void matmul(const framework::Tensor& matrix_a, + bool trans_a, + const framework::Tensor& matrix_b, + bool trans_b, float alpha, + framework::Tensor* matrix_out, + float beta, + platform::DeviceContext* context) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && + platform::is_cpu_place(matrix_b.place()) && + platform::is_cpu_place(matrix_out->place()), + "Matrix must all be in CPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data(), context); +} + +template <> +void matmul(const framework::Tensor& matrix_a, + bool trans_a, + const framework::Tensor& matrix_b, + bool trans_b, double alpha, + framework::Tensor* matrix_out, + double beta, + platform::DeviceContext* context) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && + platform::is_cpu_place(matrix_b.place()) && + platform::is_cpu_place(matrix_out->place()), + "Matrix must all be in CPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data(), context); +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu new file mode 100644 index 0000000000..da40b27c94 --- /dev/null +++ b/paddle/operators/math/math_function.cu @@ -0,0 +1,127 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { +namespace math { + +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, + const int N, const int K, + const float alpha, const float* A, + const float* B, const float beta, float* C, + platform::DeviceContext* context) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE(platform::dynload::cublasSgemm( + reinterpret_cast(context)->cublas_handle(), + cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, + const int N, const int K, + const double alpha, const double* A, + const double* B, const double beta, + double* C, + platform::DeviceContext* context) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + PADDLE_ENFORCE(platform::dynload::cublasDgemm( + reinterpret_cast(context)->cublas_handle(), + cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +template <> +void matmul(const framework::Tensor& matrix_a, + bool trans_a, + const framework::Tensor& matrix_b, + bool trans_b, float alpha, + framework::Tensor* matrix_out, + float beta, + platform::DeviceContext* context) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in GPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data(), context); +} + +template <> +void matmul(const framework::Tensor& matrix_a, + bool trans_a, + const framework::Tensor& matrix_b, + bool trans_b, double alpha, + framework::Tensor* matrix_out, + double beta, + platform::DeviceContext* context) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in GPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? 
CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data(), context); +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h new file mode 100644 index 0000000000..155589fadb --- /dev/null +++ b/paddle/operators/math/math_function.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_USE_MKLML +#include +#include +#include +#endif + +#ifdef PADDLE_USE_MKL +#include +#include +#endif + +#ifdef PADDLE_USE_ATLAS +extern "C" { +#include +#include +} +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#include +#endif + +#ifndef LAPACK_FOUND +extern "C" { +#include +int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda, + int* ipiv); +int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda, + int* ipiv); +int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda, + const int* ipiv); +int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda, + const int* ipiv); +} +#endif + +#include + +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { + +// Support continuous memory now +// If transA = N, and transB = N +// Then matrixA: M * K, matrixB: K * N matrixC : M * N +// For more detailed info, please refer to +// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html +template +void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C, platform::DeviceContext* context); + +// matrix multiply with continuous memory +template +void matmul(const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, T alpha, + framework::Tensor* matrix_out, T beta, + platform::DeviceContext* context); + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc new file mode 100644 index 0000000000..6c020c4ff7 --- /dev/null +++ b/paddle/operators/math/math_function_test.cc @@ -0,0 +1,75 @@ +#include "paddle/operators/math/math_function.h" +#include "gtest/gtest.h" + +#ifndef PADDLE_ONLY_CPU +TEST(math_function, notrans_mul_trans) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr, 6 * sizeof(float)); + + auto* gpu_place = new 
paddle::platform::GPUPlace(0); + paddle::platform::DeviceContext* context = + new paddle::platform::CUDADeviceContext(*gpu_place); + + input1_gpu.CopyFrom(input1, *gpu_place); + input2_gpu.CopyFrom(input1, *gpu_place); + + out_gpu.mutable_data({2, 2}, *gpu_place); + + paddle::operators::math::matmul( + input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0, context); + + out.CopyFrom(out_gpu, *cpu_place); + + float* out_ptr = out.data(); + EXPECT_EQ(out_ptr[0], 5); + EXPECT_EQ(out_ptr[1], 14); + EXPECT_EQ(out_ptr[2], 14); + EXPECT_EQ(out_ptr[3], 50); +} + +TEST(math_function, trans_mul_notrans) { + paddle::framework::Tensor input1; + paddle::framework::Tensor input1_gpu; + paddle::framework::Tensor input2_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + auto* cpu_place = new paddle::platform::CPUPlace(); + float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + memcpy(input1_ptr, arr, 6 * sizeof(float)); + + auto* gpu_place = new paddle::platform::GPUPlace(0); + paddle::platform::DeviceContext* context = + new paddle::platform::CUDADeviceContext(*gpu_place); + + input1_gpu.CopyFrom(input1, *gpu_place); + input2_gpu.CopyFrom(input1, *gpu_place); + + out_gpu.mutable_data({3, 3}, *gpu_place); + + paddle::operators::math::matmul( + input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0, context); + + out.CopyFrom(out_gpu, *cpu_place); + + float* out_ptr = out.data(); + EXPECT_EQ(out_ptr[0], 9); + EXPECT_EQ(out_ptr[1], 12); + EXPECT_EQ(out_ptr[2], 15); + EXPECT_EQ(out_ptr[3], 12); + EXPECT_EQ(out_ptr[4], 17); + EXPECT_EQ(out_ptr[5], 22); + EXPECT_EQ(out_ptr[6], 15); + EXPECT_EQ(out_ptr[7], 22); + EXPECT_EQ(out_ptr[8], 29); +} +#endif diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc new file mode 100644 index 0000000000..d3d0e55a67 --- /dev/null +++ b/paddle/operators/mean_op.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/mean_op.h" + +namespace paddle { +namespace operators { + +class MeanOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), + "Input of MeanOp must be initialized."); + ctx.Output("Out")->Resize({1}); + } +}; + +class MeanOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MeanOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input of mean op"); + AddOutput("Out", "The output of mean op").NotInGradient(); + AddComment("Mean Operator"); + } +}; + +class MeanGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + ctx.Output(framework::GradVarName("X")) + ->Resize(ctx.Input("X")->dims()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker, mean_grad, ops::MeanGradOp); +REGISTER_OP_CPU_KERNEL(mean, + ops::MeanKernel); +REGISTER_OP_CPU_KERNEL(mean_grad, + ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu new file mode 100644 index 0000000000..7af624d81d --- /dev/null +++ b/paddle/operators/mean_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/operators/mean_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(mean, + ops::MeanKernel); +REGISTER_OP_GPU_KERNEL(mean_grad, + ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h new file mode 100644 index 0000000000..9848af280b --- /dev/null +++ b/paddle/operators/mean_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +template +class MeanKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto X = EigenVector::Flatten(*input); + auto y = EigenScalar::From(*output); + auto& place = context.GetEigenDevice(); + + y.device(place) = X.mean(); + } +}; + +template +class MeanGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto OG = context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(framework::product(OG->dims()) == 1, + "Mean Gradient should be scalar"); + auto IG = context.Output(framework::GradVarName("X")); + IG->mutable_data(context.GetPlace()); + + T ig_size = (T)framework::product(IG->dims()); + Eigen::DSizes bcast(ig_size); + + EigenVector::Flatten(*IG).device(context.GetEigenDevice()) = + (EigenVector::From(*OG) / ig_size).broadcast(bcast); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc new file mode 100644 index 0000000000..173cc3850c --- /dev/null +++ b/paddle/operators/mul_op.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/mul_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class MulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto dim0 = ctx.Input("X")->dims(); + auto dim1 = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_EQ(dim0.size(), 2, + "input X(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("X")); + PADDLE_ENFORCE_EQ(dim1.size(), 2, + "input Y(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("Y")); + PADDLE_ENFORCE_EQ( + dim0[1], dim1[0], + "First matrix's width must be equal with second matrix's height."); + ctx.Output("Out")->Resize({dim0[0], dim1[1]}); + } +}; + +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of mul op"); + AddInput("Y", "The second input of mul op"); + AddOutput("Out", "The output of mul op"); + AddComment(R"DOC( +Two Element Mul Operator. 
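+Both inputs are 2-D matrices: X is [M x K], Y is [K x N], and Out is [M x N].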
+ +The equation is: Out = X * Y +)DOC"); + } +}; + +class MulOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx.Input("X")->dims(); + auto y_dims = ctx.Input("Y")->dims(); + auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *y_grad = ctx.Output(framework::GradVarName("Y")); + PADDLE_ENFORCE(x_dims[0] == out_dims[0], + "Out@GRAD M X N must equal to X dims 0, M "); + PADDLE_ENFORCE(y_dims[1] == out_dims[1], + "Out@GRAD M X N must equal to Y dims 1, N "); + + x_grad->Resize(x_dims); + y_grad->Resize(y_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); +REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); +REGISTER_OP_CPU_KERNEL(mul_grad, + ops::MulGradKernel); diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu new file mode 100644 index 0000000000..a81444dbe6 --- /dev/null +++ b/paddle/operators/mul_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/mul_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); +REGISTER_OP_GPU_KERNEL(mul_grad, + ops::MulGradKernel); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h new file mode 100644 index 0000000000..8facc02814 --- /dev/null +++ b/paddle/operators/mul_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include "paddle/operators/math/math_function.h" + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* Z = context.Output("Out"); + Z->mutable_data(context.GetPlace()); + auto* device_context = + const_cast(context.device_context_); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + } +}; + +template +class MulGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + dX->mutable_data(ctx.GetPlace()); + dY->mutable_data(ctx.GetPlace()); + auto* device_context = + const_cast(ctx.device_context_); + // dX = dOut * Y'. dX: M x K, dOut : M x N, Y : K x N + math::matmul(*dOut, false, *Y, true, 1, dX, 0, device_context); + // dY = X' * dOut. dY: K x N, dOut : M x N, X : M x K + math::matmul(*X, true, *dOut, false, 1, dY, 0, device_context); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/net_op.cc b/paddle/operators/net_op.cc new file mode 100644 index 0000000000..a7d7105110 --- /dev/null +++ b/paddle/operators/net_op.cc @@ -0,0 +1,98 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "paddle/operators/net_op.h" +#include +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +const char NetOp::kAll[] = "all"; + +void NetOp::CompleteAddOp(bool calc) { + add_op_done_ = true; + if (!calc) return; + std::set input_set; + std::set output_set; + for (auto& op : ops_) { + for (auto& ipt : op->Inputs()) { + for (auto& var_name : ipt.second) { + if (!Contains(output_set, var_name)) { // Not other op's output + input_set.insert(var_name); + } else { + intermediate_outputs_.insert(var_name); + } + } + } + + for (auto& opt : op->Outputs()) { + for (auto& var_name : opt.second) { + output_set.insert(var_name); + } + } + } + auto& inputs = inputs_[kAll]; + inputs.reserve(input_set.size()); + std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs)); + auto& outputs = outputs_[kAll]; + outputs.reserve(output_set.size()); + std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs)); +} + +std::string NetOp::DebugString() const { + std::ostringstream os; + os << OperatorBase::DebugString() << std::endl; + for (auto& op : ops_) { + std::istringstream is(op->DebugString()); + for (std::string line; std::getline(is, line);) { + os << " " << line << std::endl; + } + } + return os.str(); +} + +bool NetOp::IsNetOp() const { return true; } + +std::vector NetOp::OutputVars(bool has_intermediate) const { + if (has_intermediate) { + return this->outputs_.at(kAll); + } + auto& all = this->outputs_.at(kAll); + std::vector ret_val; + for (auto& each : all) { + if (!Contains(intermediate_outputs_, each)) { + ret_val.push_back(each); + } + } + return ret_val; +} + +NetOp::NetOp(const std::string& type, + const framework::OperatorBase::VarNameMap& inputs, + const framework::OperatorBase::VarNameMap& outputs, + const framework::AttributeMap& attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + +std::unique_ptr NetOp::Clone() const { + PADDLE_ENFORCE( + add_op_done_, + "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone"); + return std::unique_ptr(new NetOp(*this)); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h new file mode 100644 index 0000000000..3d3f996ef5 --- /dev/null +++ b/paddle/operators/net_op.h @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +/** + * @brief Network is also a type of Operator + * + * It will manage the operators it has. + * + * Network is the container and controller of a set of operators. + + * A network object knows all Operators belonging to this network. Variables, + * which are inputs and outputs of these operators, are created and managed by a + * hierarchy of Scope objects. 
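+ * NetOp itself has no compute kernel: InferShape() and Run() simply forward
+ * to every operator it holds, in order, against the same scope and device
+ * context.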
+ * + * This is the base class of network, all the networks should implement the APIs + * it defines. + */ +class NetOp : public framework::OperatorBase { + public: + static const char kAll[]; + NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {} + NetOp(const std::string& type, const VarNameMap& inputs, + const VarNameMap& outputs, const framework::AttributeMap& attrs); + + NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) { + this->ops_.reserve(o.ops_.size()); + std::transform( + o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_), + [](const std::unique_ptr& op) { + return std::unique_ptr(op->Clone()); + }); + this->CompleteAddOp(); + } + + /** + * Infer all the operators' input and output variables' shapes, will be called + * before every mini-batch + */ + void InferShape(const framework::Scope& scope) const override { + for (auto& op : ops_) { + op->InferShape(scope); + } + } + + /** + * @brief Run the network. + * + * Run all the operators with the `scope`, if no scope is provided, default + * scope will be used instead. If no OpContext is provicded, default context + * will be used. + */ + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override { + for (auto& op : ops_) { + op->Run(scope, dev_ctx); + } + } + + bool SupportGPU() const override { + for (auto& op : ops_) { + if (!op->SupportGPU()) { + return false; + } + } + return true; + } + + void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); } + + /** + * @brief Add an operator by ptr + */ + void AppendOp(std::unique_ptr op) { + PADDLE_ENFORCE(!add_op_done_, + "Cannot AppendOp when this network is sealed"); + PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); + ops_.push_back(std::move(op)); + } + + void InsertOp(size_t pos, std::unique_ptr op) { + PADDLE_ENFORCE(!add_op_done_, + "Cannot InsertOp when this network is sealed"); + PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); + PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range"); + ops_.insert(ops_.begin() + pos, std::move(op)); + } + + void InsertOp(size_t pos, const framework::OperatorBase& op) { + InsertOp(pos, op.Clone()); + } + + void CompleteAddOp(bool calculate = true); + + std::string DebugString() const override; + + bool IsNetOp() const override; + std::vector OutputVars(bool has_intermediate) const override; + + std::unique_ptr Clone() const override; + + std::vector> ops_; + + private: + bool add_op_done_{false}; + std::set intermediate_outputs_; + + template + static bool Contains(T container, KeyType key) { + return container.find(key) != container.end(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/framework/net_design.md b/paddle/operators/net_op_design.md similarity index 100% rename from paddle/framework/net_design.md rename to paddle/operators/net_op_design.md diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc new file mode 100644 index 0000000000..99019754a9 --- /dev/null +++ b/paddle/operators/net_op_test.cc @@ -0,0 +1,88 @@ +#include "paddle/operators/net_op.h" + +#include + +namespace paddle { +namespace operators { +using Scope = framework::Scope; +using DeviceContext = platform::DeviceContext; + +static int infer_shape_cnt = 0; +static int run_cnt = 0; + +class TestOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + DEFINE_OP_CLONE_METHOD(TestOp); + void InferShape(const Scope& scope) const override { ++infer_shape_cnt; } + void Run(const 
Scope& scope, + const platform::DeviceContext& dev_ctx) const override { + ++run_cnt; + } +}; + +template +void AssertSameVectorWithoutOrder(const std::vector& expected, + const std::vector& actual) { + ASSERT_EQ(expected.size(), actual.size()); + std::unordered_set expected_set; + for (auto& tmp : expected) { + expected_set.insert(tmp); + } + for (auto& act : actual) { + ASSERT_NE(expected_set.end(), expected_set.find(act)); + } +} + +TEST(OpKernel, all) { + auto net = std::make_shared(); + ASSERT_NE(net, nullptr); + + net->AppendOp(std::unique_ptr( + new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, + {{"Out", {"y"}}}, {}))); + net->AppendOp(std::unique_ptr( + new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}}, + {{"Out", {"z"}}}, {}))); + + net->CompleteAddOp(); + AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, + net->Inputs(NetOp::kAll)); + AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll)); + + auto final_outs = net->OutputVars(false); + + ASSERT_EQ(final_outs.size(), 1UL); + ASSERT_EQ(final_outs[0], "z"); +} + +TEST(NetOp, insert_op) { + NetOp net; + auto op1 = std::unique_ptr( + new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, + {{"Out", {"y"}}}, {})); + net.AppendOp(*op1); + net.InsertOp(0, *op1); + ASSERT_EQ(2UL, net.ops_.size()); + net.InsertOp(2, std::move(op1)); + ASSERT_EQ(3UL, net.ops_.size()); +} + +TEST(NetOp, Clone) { + NetOp net; + net.AppendOp( + std::unique_ptr(new framework::NOP{"empty", {}, {}, {}})); + net.AppendOp(std::unique_ptr( + new framework::NOP{"empty2", {}, {}, {}})); + net.CompleteAddOp(true); + auto new_net_op = net.Clone(); + ASSERT_NE(new_net_op, nullptr); + ASSERT_TRUE(new_net_op->IsNetOp()); + auto* new_net = static_cast(new_net_op.get()); + ASSERT_EQ(2, new_net->ops_.size()); + ASSERT_EQ(new_net->ops_[0]->Type(), "empty"); + ASSERT_EQ(new_net->ops_[1]->Type(), "empty2"); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc new file mode 100644 index 0000000000..78ce0ba3c0 --- /dev/null +++ b/paddle/operators/recurrent_op.cc @@ -0,0 +1,239 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/recurrent_op.h" + +#include +#include + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/net_op.h" + +namespace paddle { +namespace operators { + +using Scope = framework::Scope; +using Variable = framework::Variable; +using Tensor = framework::Tensor; + +void RecurrentAlgorithm::InferShape(const Scope& scope) const { + seq_len_ = scope.FindVar((arg_->inlinks[0]).external) + ->GetMutable() + ->dims()[0]; + CreateScopes(scope); + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + true /*infer_shape_mode*/); + InitMemories(step_scopes[0], true /*infer_shape_mode*/); + + for (size_t i = 0; i < seq_len_; i++) { + if (i > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, i, -1, + true /*infer_shape_mode*/); + } + (*stepnet_)->InferShape(*step_scopes[i]); + } + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + true /*infer_shape_mode*/); +} + +void RecurrentAlgorithm::Run(const Scope& scope, + const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + false /*infer_shape_mode*/); + InitMemories(step_scopes[0], false /*infer_shape_mode*/); + + for (size_t step_id = 0; step_id < seq_len_; step_id++) { + // create output alias variables + if (step_id > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1, + false /*infer_shape_mode*/); + } + (*stepnet_)->Run(*step_scopes[step_id], dev_ctx); + } + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + false /*infer_shape_mode*/); +} + +void RecurrentAlgorithm::CreateScopes(const Scope& scope) const { + // TODO(superjom) Only two scopes are needed for inference, this case will be + // supported later. + auto step_scopes_var = scope.FindVar(arg_->step_scopes); + PADDLE_ENFORCE(step_scopes_var != nullptr, ""); + auto step_scopes = step_scopes_var->GetMutable>(); + + // Now all variables in scope must be created outside of op. 
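+  // The step scopes are created lazily and reused across mini-batches: new
+  // scopes are only appended when seq_len_ exceeds the number created so far.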
+ PADDLE_ENFORCE_NOT_NULL(stepnet_); + PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs"); + PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "net_op has no outputs"); + + if (seq_len_ > step_scopes->size()) { + for (size_t i = step_scopes->size(); i < seq_len_; ++i) { + auto& step_scope = scope.NewScope(); + + // create step net's temp inputs + for (auto& input : (*stepnet_)->Inputs()) { + // the weight are located in parent scope + for (auto& var_name : input.second) { + if (!step_scope.FindVar(var_name)) { + step_scope.NewVar(var_name)->GetMutable(); + } + } + } + // create stepnet's outputs + for (const auto& output : (*stepnet_)->Outputs()) { + for (auto& var_name : output.second) { + step_scope.NewVar(var_name); + } + } + step_scopes->emplace_back(&step_scope); + } + } +} + +void RecurrentAlgorithm::InitMemories(Scope* step_scope, + bool infer_shape_mode) const { + for (auto& attr : arg_->memories) { + Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable(); + PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, + "memory [%s]'s boot variable [%s] not exists", attr.var, + attr.boot_var); + Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable(); + if (infer_shape_mode) { + pre_mem->Resize(boot_mem->dims()); + PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2); + } else { + pre_mem->ShareDataWith(*boot_mem); + } + } +} + +const rnn::ArgumentName RecurrentOp::kArgName{ + "step_net", "step_scopes", "inlinks", + "outlinks", "inlink_alias", "outlink_alias", + "memories", "pre_memories", "boot_memories"}; + +const rnn::ArgumentName RecurrentGradientOp::kArgName{ + "step_net", "step_scopes", "outlink@grad", + "inlink@grad", "inlink_alias", "outlink_alias", + "memories", "pre_memories", "boot_memories@grad"}; + +RecurrentOp::RecurrentOp(const std::string& type, + const framework::OperatorBase::VarNameMap& inputs, + const framework::OperatorBase::VarNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) { + rnn::InitArgument(kArgName, &arg_, *this); + alg_.Init(&arg_, &stepnet_); +} + +class RecurrentAlgorithmProtoAndCheckerMaker + : public framework::OpProtoAndCheckerMaker { + public: + RecurrentAlgorithmProtoAndCheckerMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + const auto& name = RecurrentOp::kArgName; + // inputs and outputs stored in proto + AddInput(name.inlinks, + "the inputs that need to be segmented for each step.") + .AsDuplicable(); + AddInput(name.boot_memories, "variables to initialize memories.") + .AsDuplicable(); + + AddOutput(name.outlinks, "the outputs that need to concated for all steps.") + .AsDuplicable(); + AddOutput(name.step_scopes, "step scopes"); + + // Attributes stored in AttributeMap + AddAttr>(name.inlink_alias, "alias of inlinks"); + AddAttr>(name.outlink_alias, "alias of outlinks"); + AddAttr>(name.pre_memories, + "names of pre-memories"); + AddAttr>(name.memories, "names of memories"); + + AddComment("This is a recurrent group operator."); + } +}; + +void RecurrentGradientAlgorithm::Run( + const Scope& scope, const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + false /*infer_shape_mode*/); + for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { + if (static_cast(step_id) != seq_len_ - 1) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1, + false /*infer_shape_mode*/); + 
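+      // Note: the offset is +1 here (the forward pass uses -1), so step t is
+      // linked to the memory produced by step t + 1.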
} + (*stepnet_)->Run(*step_scopes[step_id], dev_ctx); + } + LinkBootMemoryGradients(step_scopes[0], false); + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + false /*infer_shape_mode*/); +} + +void RecurrentGradientAlgorithm::LinkBootMemoryGradients( + Scope* step_scope, bool infer_shape_mode) const { + for (auto& attr : arg_->memories) { + PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr, + "memory variable [%s] does not exists", attr.var); + PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, + "boot variable [%s] does not exists", attr.boot_var); + Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable(); + Tensor* boot_mem_grad = + step_scope->NewVar(attr.boot_var)->GetMutable(); + if (infer_shape_mode) { + boot_mem_grad->Resize(mem_grad->dims()); + } else { + boot_mem_grad->ShareDataWith(*mem_grad); + } + } +} + +void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const { + seq_len_ = scope.FindVar((arg_->inlinks[0]).external) + ->GetMutable() + ->dims()[0]; + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, + true /*infer_shape_mode*/); + for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { + if (static_cast(step_id) != seq_len_ - 1) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1, + true /*infer_shape_mode*/); + } + (*stepnet_)->InferShape(*step_scopes[step_id]); + } + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, + true /*infer_shape_mode*/); + LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/); +} + +RecurrentGradientOp::RecurrentGradientOp( + const std::string& type, const framework::OperatorBase::VarNameMap& inputs, + const framework::OperatorBase::VarNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) { + rnn::InitArgument(kArgName, &arg_, *this); + alg_.Init(&arg_, &stepnet_); +} + +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + recurrent_op, paddle::operators::RecurrentOp, + paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker); diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h new file mode 100644 index 0000000000..bcfa817de8 --- /dev/null +++ b/paddle/operators/recurrent_op.h @@ -0,0 +1,190 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/operator.h" +#include "paddle/operators/net_op.h" +#include "paddle/operators/rnn/recurrent_op_utils.h" + +namespace paddle { +namespace operators { + +// The sequence format in RecurrentOp is Tensor now. +// TODO(Yan Chunwei): +// 1. No-padding computing for sequences with indifinite length in one batch. +// 2. Hierarchical RNN for sequence with sub-sequence. +// 3. Internal Memory. +// 4. More Complex RNN architecture, such as Gated Feedback RNN. 
+// Refer to: https://arxiv.org/pdf/1502.02367.pdf + +class RecurrentAlgorithm { + public: + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + + void Init(rnn::Argument* arg, + std::unique_ptr* stepnet) { + PADDLE_ENFORCE_NOT_NULL(stepnet, "stepnet should be set before."); + arg_ = arg; + stepnet_ = stepnet; + } + + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const; + + protected: + /* + * The step scopes will be stored in the father scope as a variable. + * + * NOTE the scopes are reused in both the forward and backward, so just + * create once and expand its size if more steps need. + */ + void CreateScopes(const framework::Scope& scope) const; + + const std::vector& GetStepScopes( + const framework::Scope& scope) const { + return *scope.FindVar(arg_->step_scopes) + ->GetMutable>(); + } + + void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const; + + private: + std::unique_ptr* stepnet_; + rnn::Argument* arg_; + mutable size_t seq_len_; +}; + +class RecurrentGradientAlgorithm { + /** + * RNN's backward alogorithm. + * + * To accelerate the development of RecurrentGradientOp, we decouple RNN's + * algorithm and `OperatorBase`'s implementation, the former contains the core + * implementation of a RNN, and will keep stable even if the framework changes + * a + * lot, and the latter is a wrapper acts like an dapter for it to make RNN an + * operator. + */ + public: + void Init(rnn::Argument* arg, + std::unique_ptr* stepnet) { + PADDLE_ENFORCE_NOT_NULL(stepnet, "stepnet should be set before."); + arg_ = std::move(arg); + stepnet_ = stepnet; + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const; + + void LinkBootMemoryGradients(framework::Scope* step_scopes, + bool infer_shape_mode) const; + + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const; + + protected: + inline const std::vector& GetStepScopes( + const framework::Scope& scope) const { + return *scope.FindVar(arg_->step_scopes) + ->GetMutable>(); + } + + private: + rnn::Argument* arg_; + mutable size_t seq_len_; + std::unique_ptr* stepnet_; +}; + +class RecurrentOp : public framework::OperatorBase { + public: + RecurrentOp(const std::string& type, const VarNameMap& inputs, + const VarNameMap& outputs, const framework::AttributeMap& attrs); + + RecurrentOp(const RecurrentOp& o) + : framework::OperatorBase( + static_cast(o)) { + // TODO(yuyang18): Implement copy ctor well. + PADDLE_THROW("Not implemented"); + } + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const override { + alg_.InferShape(scope); + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + void set_stepnet(std::unique_ptr net) { + stepnet_ = std::move(net); + } + const OperatorBase& stepnet() const { return *stepnet_; } + + static const rnn::ArgumentName kArgName; + + private: + RecurrentAlgorithm alg_; + rnn::Argument arg_; + std::unique_ptr stepnet_; +}; + +class RecurrentGradientOp : public framework::OperatorBase { + public: + RecurrentGradientOp(const std::string& type, const VarNameMap& inputs, + const VarNameMap& outputs, + const framework::AttributeMap& attrs); + + RecurrentGradientOp(const RecurrentGradientOp& o) + : framework::OperatorBase( + static_cast(o)) { + // TODO(yuyang18): Implement Copy ctor. 
+ PADDLE_THROW("Not Implemented"); + } + + /** + * InferShape must be called before Run. + */ + void InferShape(const framework::Scope& scope) const override { + alg_.InferShape(scope); + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + + void set_stepnet(std::unique_ptr net) { + stepnet_ = std::move(net); + } + const OperatorBase& stepnet() const { return *stepnet_; } + + private: + RecurrentGradientAlgorithm alg_; + std::unique_ptr stepnet_; + rnn::Argument arg_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc new file mode 100644 index 0000000000..a9b65c30f2 --- /dev/null +++ b/paddle/operators/rnn/recurrent_op_utils.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/rnn/recurrent_op_utils.h" + +namespace paddle { +namespace operators { +namespace rnn { + +namespace f = paddle::framework; + +using Tensor = framework::Tensor; + +void SegmentInputs(const std::vector& step_scopes, + const std::vector& inlinks, const size_t seq_len, + bool infer_shape_mode) { + PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided."); + for (size_t i = 0; i < inlinks.size(); ++i) { + auto input_var = step_scopes[0]->FindVar(inlinks[i].external); + PADDLE_ENFORCE(input_var != nullptr, "input link [%s] is not in scope.", + inlinks[i].external); + + Tensor* input = input_var->GetMutable(); + f::DDim dims = input->dims(); + PADDLE_ENFORCE(static_cast(dims[0]) == seq_len, + "all the inlinks must have same length"); + f::DDim step_dims = slice_ddim(dims, 1, dims.size()); + for (size_t j = 0; j < seq_len; j++) { + Tensor* step_input = + step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable(); + if (!infer_shape_mode) { + *step_input = input->Slice(j, j + 1); + } + step_input->Resize(step_dims); + } + } +} + +void ConcatOutputs(const std::vector& step_scopes, + const std::vector& outlinks, const size_t seq_len, + bool infer_shape_mode) { + for (size_t i = 0; i < outlinks.size(); i++) { + auto output_var = step_scopes[0]->FindVar(outlinks[i].external); + PADDLE_ENFORCE(output_var != nullptr, "output link [%s] is not in scope.", + outlinks[i].external); + Tensor* output = output_var->GetMutable(); + + if (infer_shape_mode) { + auto step_scope_var = step_scopes[0]->FindVar(outlinks[i].internal); + PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope", + outlinks[i].internal); + f::DDim step_dims = step_scope_var->template GetMutable()->dims(); + std::vector dims_vec = vectorize(step_dims); + dims_vec.insert(dims_vec.begin(), seq_len); + output->Resize(f::make_ddim(dims_vec)); + } else { + output->mutable_data(platform::CPUPlace()); + for (size_t j = 0; j < seq_len; j++) { + Tensor* step_output = + step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable(); + // 
TODO(luotao02) data type and platform::DeviceContext() should set + // correctly + (output->Slice(j, j + 1)) + .CopyFrom(*step_output, platform::CPUPlace()); + } + } + } +} + +void LinkMemories(const std::vector& scopes, + const std::vector& memories, + const size_t step_id, const int offset, + bool infer_shape_mode) { + PADDLE_ENFORCE_LT(step_id, scopes.size(), + "step [%d] is out of range of step scopes' size [%d]", + step_id, scopes.size()); + PADDLE_ENFORCE_GE(static_cast(step_id) + offset, 0, + "offset [%d] must be large than -[%d]", offset, step_id); + PADDLE_ENFORCE_LT( + step_id + offset, scopes.size(), + "offset [%d] is out of range, it must be less than (%d - %d)", offset, + scopes.size(), step_id); + auto scope = scopes[step_id]; + auto linked_scope = scopes[step_id + offset]; + for (auto& attr : memories) { + auto mem = scope->FindVar(attr.pre_var)->GetMutable(); + auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable(); + if (infer_shape_mode) { + mem->Resize(linked_mem->dims()); + } else { + mem->ShareDataWith(*linked_mem); + } + } +} + +void InitArgument(const ArgumentName& name, Argument* arg, + const framework::OperatorBase& op) { + arg->step_scopes = op.Output(name.step_scopes); + + auto inlinks = op.Inputs(name.inlinks); + auto inlink_alias = op.GetAttr>(name.inlink_alias); + PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(), + "the size of inlinks and inlink_alias don't match:%d,%d", + inlinks.size(), inlink_alias.size()); + for (size_t i = 0; i < inlinks.size(); ++i) { + rnn::Link link; + link.external = inlinks[i]; + link.internal = inlink_alias[i]; + (arg->inlinks).push_back(link); + } + + auto outlinks = op.Outputs(name.outlinks); + auto outlink_alias = op.GetAttr>(name.outlink_alias); + PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(), + "the size of outlinks and outlink_alias don't match:%d,%d", + outlinks.size(), outlink_alias.size()); + for (size_t i = 0; i < outlinks.size(); ++i) { + rnn::Link link; + link.external = outlinks[i]; + link.internal = outlink_alias[i]; + (arg->outlinks).push_back(link); + } + + auto boot_memories = op.Inputs(name.boot_memories); + + // attributes + auto memories = op.GetAttr>(name.memories); + auto pre_memories = op.GetAttr>(name.pre_memories); + + PADDLE_ENFORCE(memories.size() == boot_memories.size(), + "the size of memories, boot_memories don't match:%d,%d", + memories.size(), boot_memories.size()); + PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), + "the size of pre_memories, boot_memories don't match:%d,%d", + pre_memories.size(), boot_memories.size()); + PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set"); + + for (size_t i = 0; i < memories.size(); ++i) { + rnn::MemoryAttr mem_attr; + mem_attr.var = memories[i]; + mem_attr.pre_var = pre_memories[i]; + mem_attr.boot_var = boot_memories[i]; + (arg->memories).push_back(mem_attr); + } +} + +} // namespace rnn +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h new file mode 100644 index 0000000000..17941c503c --- /dev/null +++ b/paddle/operators/rnn/recurrent_op_utils.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include + +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { +namespace rnn { + +using Scope = framework::Scope; + +/** + * Memory of a RNN (same as the role of `Momory` in PaddlePaddle). + * + * Memory attributes cached by this op, dims will be infered from + * boot memories in father scope. Other attributes are copied from Op's proto + * attributes. + */ +struct MemoryAttr { + // name of current state variable + std::string var; + // name of previous step's state variable + std::string pre_var; + // name of the variables to init this memory (same role of `boot_layer` in + // PaddlePaddle), which is store in father's scope. + std::string boot_var; +}; + +struct Link { + // input or output links name. + std::string internal; + // alias to avoid duplicate keys in scopes. + std::string external; +}; + +struct Argument { + std::string step_net; + std::string step_scopes; + std::vector inlinks; + std::vector outlinks; + std::vector memories; +}; + +struct ArgumentName { + std::string step_net; + std::string step_scopes; + std::string inlinks; + std::string outlinks; + std::string inlink_alias; // the alias of inlinks in step net. + std::string outlink_alias; // the alias of outlinks in step net. + std::string memories; // the memory name + std::string pre_memories; // the previous memory name + std::string boot_memories; // the boot memory name +}; + +/** + * Prepare inputs for each step net. + */ +void SegmentInputs(const std::vector& step_scopes, + const std::vector& inlinks, const size_t seq_len, + bool infer_shape_mode); + +/** + * Process outputs of step nets and merge to variables. 
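+ * In infer_shape_mode only the dims of the merged output are set; otherwise
+ * each step's output is copied into the corresponding slice of the merged
+ * tensor.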
+ */
+void ConcatOutputs(const std::vector<Scope*>& step_scopes,
+                   const std::vector<Link>& outlinks, const size_t seq_len,
+                   bool infer_shape_mode);
+
+void LinkMemories(const std::vector<Scope*>& step_scopes,
+                  const std::vector<MemoryAttr>& memories, const size_t step_id,
+                  const int offset, bool infer_shape_mode);
+
+void InitArgument(const ArgumentName& name, Argument* arg,
+                  const framework::OperatorBase& op);
+
+}  // namespace rnn
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/rnn_design.md b/paddle/operators/rnn_design.md
new file mode 100644
index 0000000000..3d38b9a0ad
--- /dev/null
+++ b/paddle/operators/rnn_design.md
@@ -0,0 +1,239 @@
+# Design for variable-length input to RNN
+For learning over variable-length sequences, the mainstream frameworks such as tensorflow, pytorch, caffe2 and mxnet all rely on padding:
+sequences of different lengths inside a mini-batch are zero-padded to a fixed length before taking part in the computation.
+
+The existing Paddle RNNs, including `RecurrentLayerGroup`, already support variable-length sequences without padding. Based on the ideas of that module, this document designs the variable-length sequence support for the refactored framework.
+
+## Background
+Since a tensor must have a definite shape, tensor-based frameworks have to zero-pad variable-length sequences into tensors of a fixed shape when storing them.
+
+Padding is a compromise a framework makes to support variable-length sequences; from the user's point of view it is a nuisance when working with RNN-style models, which is why there are lengthy discussions about non-padding support for variable-length sequences in pytorch[3].
+
+Because padding costs extra memory and computation, tensorflow and mxnet both use bucketing as an optimization[1][2], but whether it is padding or bucketing, it remains an extra burden on the user.
+
+Therefore, **native support for variable-length sequences in paddle directly satisfies the user's most immediate need, and can be counted as a major advantage over the current mainstream platforms**.
+
+Supporting variable-length sequences, however, requires some changes to the current framework. The following sections discuss how to support them with minimal modification.
+
+## `LODTensor`, a multi-level sequence data format
+Currently Paddle stores the data of a mini-batch in one-dimensional memory and additionally uses `Argument.sequenceStartPositions` to record the boundaries of each sentence.
+
+Paddle uses `Argument.subSequenceStartPositions` to store two-level sequence information; sequences with more levels cannot be supported directly.
+
+To support the storage of `N-level` sequences, this document defines the sequence information as the following data structure:
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+or, more explicitly,
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+
+Each `level_t` stores the offsets of one granularity (level), consistent with what paddle does today.
+
+To pass sequence information around more transparently, we introduce a new tensor called `LODTensor`[4].
+All of its tensor-related interfaces are inherited directly from `Tensor`, with additional sequence-related interfaces added on top.
+This way, an ordinary `Op` can treat an `LODTensor` simply as a `Tensor`,
+while a sequence-aware `Op` additionally uses the variable-length sequence interfaces of `LODTensor`.
+
+`LODTensor` is defined as follows:
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return lod_start_pos_->size(); }
+  size_t Elements(int level = 0) const {
+    return (*lod_start_pos_)[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slicing lod_start_pos_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info should not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared<std::vector<std::vector<int>>>(
+        other.lod_start_pos_->begin(), other.lod_start_pos_->end());
+  }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+
+Here `lod_start_pos_` uses a `shared_ptr` to reduce the cost of storage and copying.
+`LODTensor` can be regarded as an extension of `Tensor`, almost fully compatible with the original use of `Tensor`.
+
+## Framework support
+### Replacing the framework's existing `Tensor` calls with `LODTensor`
+To propagate `LODTensor`, many `Tensor`s in the framework need to become `LODTensor`s.
+The simplest implementation is to **replace all existing `Tensor`s with `LODTensor`, which can be done by directly modifying the `Tensor` creation interface in `pybind.cc`**.
+
+In addition, users may need to be aware of the sequences (for example, visualizing sequences requires parsing the sequences output by the model), so some sequence-manipulation APIs also need to be exposed to the python layer.
+
+### Passing `lod_start_pos` along the Op call chain
+The framework needs to support the following features in order to pass `lod_start_pos` along:
+
+1. Pass it as a `shared_ptr`
+    - an Op that does not modify the content of `lod_start_pos` acts as a consumer
+    - an Op that modifies `lod_start_pos` acts as a producer
+    - by convention, a consumer only copies the `shared_ptr` passed to it
+      - a producer allocates its own independent memory to store its own modification and exposes a `shared_ptr` to subsequent consumers
+      - since propagation is implemented by copying the `shared_ptr`, the framework only needs to pass `lod_start_pos` once
+
+2. Be fully transparent to Ops that are unaware of `lod_start_pos`
+3. A producer Op that needs to modify `lod_start_pos` can update its own `lod_start_pos` data in `Run`
+
+The concrete design is split into the following three subsections.
+
+#### Passing `lod_start_pos`
+
+- When `lod_start_pos` does not need to be modified, call LODTensor's `ShareConstLODFrom` interface to copy it
+- When it does need to be modified, call the `ShareMutableLODFrom` interface to allocate separate memory for the modification
+
+#### Framework transparency
+The propagation step needs to be added to the initialization that runs before the network executes, and it only needs to happen once. A preliminary scheme based on the current framework design is as follows:
+
+- Add a `do_mutate_lod_info` attribute to the Op's `attrs`, defaulting to `false`
+  - an Op that needs to modify `lod_start_pos` sets it to `true` when defining its `OpProto`
+- `OperatorBase::InferShape` reads `do_mutate_lod_info` and calls the corresponding `LODTensor` methods to copy `lod_start_pos`.
+- Add a member `is_lod_inited{false}` to `OperatorBase` to guarantee that the propagation happens only once
+
+The logic looks roughly like this:
+
+```c++
+class OperatorBase {
+public:
+  // ...
+  void InferShape() {
+    if (!is_lod_inited) {
+      bool do_mutate_lod_info = GetAttr<bool>("do_mutate_lod_info");
+      // find an input having LOD to copy
+      auto lod_input = ValidLODInput();
+      for (auto &output : outputs) {
+        if (do_mutate_lod_info) {
+          output.ShareMutableLODFrom(lod_input);
+        } else {
+          output.ShareConstLODFrom(lod_input);
+        }
+      }
+      is_lod_inited = true;
+    }
+
+    // call op's InferShape
+    // ...
+  }
+
+private:
+  // ...
+  bool is_lod_inited{false};
+};
+```
+
+In this way, the propagation of the `lod_start_pos` information is completely transparent to the implementation of non-LOD Ops.
+
+#### Updating `lod_start_pos`
+As described in the previous subsection, for an Op that needs to modify `lod_start_pos`, `OperatorBase` allocates a separate piece of memory to store the modification.
+The Op updates its own `lod_start_pos` inside its `Run` implementation,
+and all ops that depend on its outputs automatically see the update through the shared pointer.
+
+## Sorting by length
+After sorting by length, the batch size of successive time steps decreases naturally, and the data can be fed directly into the Net for batched computation.
+
+For example, the original input:
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+After `SegmentInputs` there are 4 time steps, and the input of each time step is as follows (arranged column-wise):
+
+```
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+
+To track how the sequences move during sorting, we use
+```c++
+struct SortedSeqItem {
+   void *start{nullptr};
+   void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+to record the positions of the sequences after sorting, and add a new interface
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+
+Because the order of the input sequences changes, the following existing interfaces need corresponding modification:
+
+- InitMemories: the memory needs to be rearranged according to `sorted_seqs`
+- SegmentInputs
+- ConcatOutputs
+
+In addition, since `sorted_seqs` needs to be reused by `RecurrentGradientOp`, it becomes a new output of `RecurrentOp` and is later passed in as an input of `RecurrentGradientOp`.
+
+## InitMemories
+Because the sequence order changes, the order of the elements in the `boot_memories` batch also needs to be rearranged accordingly.
+
+## SegmentInputs
+`SegmentInputs` relies on the information in `sorted_seqs` to cut the original sequences horizontally, following the sorted order, into the inputs of each step.
+
+That is, the transformation below:
+```
+origin:
+xxxx
+xx
+xxx
+
+   |
+   |
+  \ /
+   !
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+## ConcatOutputs
+`ConcatOutputs` needs to
+
+- restore the outputs of each time step to the original input sequence order (so that the order is not shuffled at Infer time)
+- concat each sequence into a regular mini-batch representation
+
+## References
+1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
+3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
new file mode 100644
index 0000000000..6825dce332
--- /dev/null
+++ b/paddle/operators/rowwise_add_op.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/rowwise_add_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class RowwiseAddOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto dim0 = ctx.Input("X")->dims(); + auto dim1 = ctx.Input("b")->dims(); + + PADDLE_ENFORCE(dim0.size() == 2, "Input 0 must be matrix"); + PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector"); + PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same"); + PADDLE_ENFORCE(ctx.OutputSize("Out") == 1, "The output size must be 1"); + ctx.Output("Out")->Resize(ctx.Input("X")->dims()); + } +}; + +class RowwiseAddOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RowwiseAddOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The left input of row-wise add op, must be matrix"); + AddInput("b", "The right input of row-wise add op, must be vector"); + AddOutput("Out", "The output of row-wise add op"); + AddComment(R"DOC(Row-wise Add operator + +for i in xrange(X.shape[0]): + Out = X[i] + b +)DOC"); + } +}; +class RowwiseAddGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "X should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("b"), "b should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto dims0 = ctx.Input("X")->dims(); + auto dims1 = ctx.Input("b")->dims(); + PADDLE_ENFORCE_EQ(1, dims1.size(), "b dims should be 1") + ctx.Output(framework::GradVarName("X"))->Resize(dims0); + ctx.Output(framework::GradVarName("b"))->Resize(dims1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(rowwise_add, ops::RowwiseAddOp, ops::RowwiseAddOpMaker, + rowwise_add_grad, ops::RowwiseAddGradOp); +REGISTER_OP_CPU_KERNEL( + rowwise_add, ops::RowwiseAddKernel); +REGISTER_OP_CPU_KERNEL( + rowwise_add_grad, + ops::RowwiseAddGradKernel); diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu new file mode 100644 index 0000000000..cbc61ad3e1 --- /dev/null +++ b/paddle/operators/rowwise_add_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/rowwise_add_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL( + rowwise_add, ops::RowwiseAddKernel); diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h new file mode 100644 index 0000000000..1cbd8bb31a --- /dev/null +++ b/paddle/operators/rowwise_add_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +class RowwiseAddKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + auto input = EigenMatrix::From(*context.Input("X")); + auto bias = EigenVector::From(*context.Input("b")); + auto output = EigenMatrix::From(*out); + + const int bias_size = bias.dimension(0); + const int rest_size = input.size() / bias_size; + Eigen::DSizes one_d(input.size()); + Eigen::DSizes bcast(rest_size); + output.reshape(one_d).device(context.GetEigenDevice()) = + input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d); + } +}; + +template +class RowwiseAddGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dOut = context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); + auto* db = context.Output(framework::GradVarName("b")); + dX->mutable_data(context.GetPlace()); + db->mutable_data(context.GetPlace()); + + auto OutGrad = EigenMatrix::From(*dOut); + auto place = context.GetEigenDevice(); + EigenMatrix::From(*dX).device(place) = OutGrad; + + // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html + // colwise add + Eigen::array dims{{0}}; /* dimension to reduce */ + EigenVector::Flatten(*db).device(place) = OutGrad.sum(dims); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/scatter.h b/paddle/operators/scatter.h new file mode 100644 index 0000000000..6b542675c2 --- /dev/null +++ b/paddle/operators/scatter.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include "paddle/framework/ddim.h" +#include "paddle/framework/eigen.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +// Implementation of CPU copy +template +void CPUScatterUpdate(const paddle::framework::Tensor* src, const int* index, + const size_t index_size, + paddle::framework::Tensor* output) { + paddle::framework::DDim output_dims = output->dims(); + + for (size_t i = 0; i < index_size; ++i) { + int index_ = index[i]; + + paddle::framework::Tensor src_ = *src; + paddle::framework::Tensor output_ = *output; + if (index_size > 1) src_ = src->Slice(i, i + 1); + if (output_dims[0] > 1) output_ = output->Slice(index_, index_ + 1); + + auto X = EigenVector::Flatten(src_); + auto Y = EigenVector::Flatten(output_); + + Y = X + Y; + } +} + +// Implementation of GPU scatter: +template +void GPUScatterUpdate(const T* src, const int* index, const int slice_size, + const int index_size, T* output); + +/** + * Return a updated tensor from source tensor, scattered according to index: + * dst[i] += src[index[i]] + * input[src]: type-T source Tensor + * input[index]: type-int index Tensor (1-D) + * return: output tensor + */ +template +void ScatterUpdate(const platform::Place& place, + const paddle::framework::Tensor* src, + const paddle::framework::Tensor* index, + paddle::framework::Tensor* output) { + // check index of shape 1-D + PADDLE_ENFORCE(index->dims().size() == 1); + int index_size = index->dims()[0]; + + auto src_dims = src->dims(); + auto dst_dims = output->dims(); + + // check src shape and dst shape should match + for (int i = 1; i < src_dims.size(); i++) + PADDLE_ENFORCE(src_dims[i] == dst_dims[i]); + + // slice size + size_t slice_size = 1; + for (int i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + if (platform::is_cpu_place(place)) { + CPUScatterUpdate(src, index->data(), index_size, output); + } else { + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/scatter_test.cc b/paddle/operators/scatter_test.cc new file mode 100644 index 0000000000..26fdaff146 --- /dev/null +++ b/paddle/operators/scatter_test.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/scatter.h" +#include "paddle/framework/ddim.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/place.h" + +#include +#include +#include + +TEST(scatter, ScatterUpdate) { + using namespace paddle::framework; + using namespace paddle::platform; + using namespace paddle::operators; + + Tensor* src = new Tensor(); + Tensor* index = new Tensor(); + Tensor* output = new Tensor(); + + float* p_src = nullptr; + int* p_index = nullptr; + p_src = src->mutable_data(make_ddim({1, 4}), CPUPlace()); + p_index = index->mutable_data(make_ddim({1}), CPUPlace()); + + for (size_t i = 0; i < 4; ++i) p_src[i] = float(i); + p_index[0] = 1; + + float* p_output = output->mutable_data(make_ddim({4, 4}), CPUPlace()); + + ScatterUpdate(CPUPlace(), src, index, output); + + for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0)); + for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data()[i], float(0)); + for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], float(i - 4)); + for (size_t i = 4; i < 8; ++i) + EXPECT_EQ(output->data()[i], float(i - 4)); + for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0)); + for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data()[i], float(0)); + + delete src; + delete index; + delete output; +} diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc new file mode 100644 index 0000000000..ad267e7f08 --- /dev/null +++ b/paddle/operators/sgd_op.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/sgd_op.h" + +namespace paddle { +namespace operators { + +class SGDOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE( + ctx.Input("param")->dims() == ctx.Input("grad")->dims(), + "Two input of SGD Op's dimension must be same."); + ctx.Output("param_out")->Resize(ctx.Input("param")->dims()); + } +}; + +class SGDOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("param", "input parameter"); + AddInput("grad", "input gradient"); + AddOutput("param_out", "output parameter"); + AddAttr("learning_rate", "learning rate of sgd"); + AddComment(R"DOC( + +Simplest sgd algorithm. + +param_out = param - learning_rate * grad; + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); +REGISTER_OP_CPU_KERNEL(sgd, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu new file mode 100644 index 0000000000..f5ba6d3c29 --- /dev/null +++ b/paddle/operators/sgd_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/sgd_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(sgd, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h new file mode 100644 index 0000000000..a0b5000ffb --- /dev/null +++ b/paddle/operators/sgd_op.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class SGDOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param = ctx.Input("param"); + auto grad = ctx.Input("grad"); + auto param_out = ctx.Output("param_out"); + float lr = ctx.op_.GetAttr("learning_rate"); + + param_out->mutable_data(ctx.GetPlace()); + + auto p = EigenVector::Flatten(*param); + auto g = EigenVector::Flatten(*grad); + auto o = EigenVector::Flatten(*param_out); + auto place = ctx.GetEigenDevice(); + + o.device(place) = p - lr * g; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc new file mode 100644 index 0000000000..761c6de8d4 --- /dev/null +++ b/paddle/operators/sigmoid_op.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/sigmoid_op.h" + +namespace paddle { +namespace operators { + +class SigmoidOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + ctx.Output("Y")->Resize(ctx.Input("X")->dims()); + } +}; + +class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SigmoidOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "sigmoid input"); + AddOutput("Y", "sigmoid output"); + AddComment("Sigmoid function"); + } +}; + +class SigmoidOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + ctx.Output(framework::GradVarName("X")) + ->Resize(ctx.Input("Y")->dims()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker, sigmoid_grad, + ops::SigmoidOpGrad); +REGISTER_OP_CPU_KERNEL(sigmoid, + ops::SigmoidKernel); +REGISTER_OP_CPU_KERNEL( + sigmoid_grad, ops::SigmoidGradKernel); diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu new file mode 100644 index 0000000000..1a50dfe14a --- /dev/null +++ b/paddle/operators/sigmoid_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/sigmoid_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_GPU_KERNEL(sigmoid, + ops::SigmoidKernel); +REGISTER_OP_GPU_KERNEL( + sigmoid_grad, ops::SigmoidGradKernel); diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h new file mode 100644 index 0000000000..b01a9b3f23 --- /dev/null +++ b/paddle/operators/sigmoid_op.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class SigmoidKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto input = context.Input("X"); + auto output = context.Output("Y"); + output->mutable_data(context.GetPlace()); + + // The clipping is used in Paddle's raw implenmention + auto X = EigenVector::Flatten(*input); + auto Y = EigenVector::Flatten(*output); + auto place = context.GetEigenDevice(); + + Y.device(place) = 1. / (1. + (-X).exp()); + } +}; + +template +class SigmoidGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto Y_t = context.Input("Y"); + auto dY_t = context.Input(framework::GradVarName("Y")); + auto dX_t = context.Output(framework::GradVarName("X")); + + dX_t->mutable_data(context.GetPlace()); + + auto dX = EigenVector::Flatten(*dX_t); + auto Y = EigenVector::Flatten(*Y_t); + auto dY = EigenVector::Flatten(*dY_t); + dX.device(context.GetEigenDevice()) = dY * Y * (1. - Y); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc new file mode 100644 index 0000000000..40c51a64c4 --- /dev/null +++ b/paddle/operators/softmax_op.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +class SoftmaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.Input("X")->dims().size() == 2UL, + "The input of softmax op must be matrix"); + ctx.Output("Y")->Resize(ctx.Input("X")->dims()); + } +}; + +class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftmaxOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "input of softmax"); + AddOutput("Y", "output of softmax"); + AddComment("Softmax Op"); + } +}; + +class SoftmaxOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE(ctx.InputVar("Y") != nullptr, "Input(Y) should not be null"); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")), + "Input(Y@GRAD) should not be null"); + PADDLE_ENFORCE(ctx.Input("Y")->dims() == + ctx.Input(framework::GradVarName("Y"))->dims(), + "the shape of Input(0) and Input(1) should be the same"); + ctx.Output(framework::GradVarName("X")) + ->Resize(ctx.Input("Y")->dims()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, + ops::SoftmaxOpGrad); +REGISTER_OP_CPU_KERNEL(softmax, + ops::SoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + softmax_grad, ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu new file mode 100644 index 0000000000..2e99a89699 --- /dev/null +++ b/paddle/operators/softmax_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/softmax_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_GPU_KERNEL(softmax, + ops::SoftmaxKernel); +REGISTER_OP_GPU_KERNEL( + softmax_grad, ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h new file mode 100644 index 0000000000..4fa6b59540 --- /dev/null +++ b/paddle/operators/softmax_op.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class SoftmaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto input = context.Input("X"); + auto output = context.Output("Y"); + output->mutable_data(context.GetPlace()); + + auto logits = EigenMatrix::From(*input); + auto softmax = EigenMatrix::From(*output); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + + softmax.device(context.GetEigenDevice()) = shifted_logits.exp(); + + softmax.device(context.GetEigenDevice()) = + (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + } +}; + +template +class SoftmaxGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + std::shared_ptr scale_ = std::make_shared(); + + auto Y = context.Input("Y"); + auto dY = context.Input(framework::GradVarName("Y")); + auto dX = context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + + const int batch_size = Y->dims()[0]; + const int class_num = Y->dims()[1]; + + Eigen::DSizes along_class(1); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, class_num); + + auto Y_eigen = EigenMatrix::From(*Y); + auto dY_eigen = EigenMatrix::From(*dY); + auto dX_eigen = EigenMatrix::From(*dX); + auto place = context.GetEigenDevice(); + + auto dot = (Y_eigen * dY_eigen) + .sum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class); + dX_eigen.device(place) = (dY_eigen - dot) * Y_eigen; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc new file mode 100644 index 0000000000..29491137e6 --- /dev/null +++ b/paddle/operators/uniform_random_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. 
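+// Illustrative sketch of that sampling pattern (for exposition only: the
+// helper below is not referenced by the kernels, its name is made up, and it
+// assumes <random> is available, as it is in this file). The idea is to seed
+// the engine once (from std::random_device when the "seed" attribute is 0)
+// and then draw every element from a single uniform_real_distribution.
+static void UniformSampleSketch(float* data, size_t n, float min, float max,
+                                unsigned int seed) {
+  if (seed == 0) seed = std::random_device()();  // non-deterministic seed
+  std::minstd_rand engine(seed);
+  std::uniform_real_distribution<float> dist(min, max);
+  for (size_t i = 0; i < n; ++i) data[i] = dist(engine);
+}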
+template +class CPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = + static_cast(context.op_.GetAttr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::uniform_real_distribution dist( + static_cast(context.op_.GetAttr("min")), + static_cast(context.op_.GetAttr("max"))); + ssize_t size = framework::product(tensor->dims()); + for (ssize_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + } +}; + +class UniformRandomOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext& ctx) const override { + PADDLE_ENFORCE(GetAttr("min") < GetAttr("max"), + "uniform_random's min must less then max"); + auto* tensor = ctx.Output("Out"); + auto dims = GetAttr>("dims"); + tensor->Resize(framework::make_ddim(dims)); + } +}; + +class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { + public: + UniformRandomOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "The output tensor of uniform random op"); + AddComment(R"DOC(Uniform random operator. +Used to initialize tensor with uniform random generator. +)DOC"); + AddAttr>("dims", "the dimension of random tensor"); + AddAttr("min", "Minimum value of uniform random").SetDefault(-1.0f); + AddAttr("max", "Maximun value of uniform random").SetDefault(1.0f); + AddAttr("seed", + "Random seed of uniform random. " + "0 means generate a seed by system") + .SetDefault(0); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp, + paddle::operators::UniformRandomOpMaker); +REGISTER_OP_CPU_KERNEL(uniform_random, + paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu new file mode 100644 index 0000000000..1d6709934c --- /dev/null +++ b/paddle/operators/uniform_random_op.cu @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + + __host__ __device__ UniformGenerator(T min, T max, int seed) + : min_(min), max_(max), seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + return dist(rng); + } +}; + +// It seems that Eigen::Tensor::random in GPU will SEGFAULT. 
+// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. +template +class GPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = + static_cast(context.op_.GetAttr("seed")); + if (seed == 0) { + std::random_device rd; + seed = rd(); + } + T min = static_cast(context.op_.GetAttr("min")); + T max = static_cast(context.op_.GetAttr("max")); + thrust::counting_iterator index_sequence_begin(0); + ssize_t N = framework::product(tensor->dims()); + thrust::transform(index_sequence_begin, index_sequence_begin + N, + thrust::device_ptr(data), + UniformGenerator(min, max, seed)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_GPU_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc index 54662dc378..eb7125adee 100644 --- a/paddle/optimizer/optimizer.cc +++ b/paddle/optimizer/optimizer.cc @@ -44,8 +44,8 @@ paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto, const int state_len) { paddle_optimizer* optimizer = new paddle_optimizer; std::string config(config_proto, config_proto + config_proto_len); - Tensor* parameter = - new Tensor(reinterpret_cast(param_buffer), num_bytes); + Tensor* parameter = new Tensor(reinterpret_cast(param_buffer), + num_bytes / sizeof(float)); optimizer->impl = ParameterOptimizer::Create(config, parameter); if (state != nullptr) { std::string s(state, state + state_len); @@ -65,7 +65,8 @@ int paddle_update_parameter(paddle_optimizer* o, int num_bytes) { // TOOD(zhihong): datatype not work. need to add the runtime datatype auto grad_type = reinterpret_cast(grad_buffer); - Tensor* gradient = new Tensor(const_cast(grad_type), num_bytes); + Tensor* gradient = + new Tensor(const_cast(grad_type), num_bytes / sizeof(float)); o->impl->Update(gradient); return PADDLE_SUCCESS; } diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp index 4e6254d9e4..edf4ae37a9 100644 --- a/paddle/optimizer/parameter_optimizer_test.cpp +++ b/paddle/optimizer/parameter_optimizer_test.cpp @@ -1,3 +1,19 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + #include "parameter_optimizer.h" #include #include @@ -5,21 +21,18 @@ #include "gtest/gtest.h" #include "lr_policy.h" -using namespace paddle; -using namespace paddle::optimizer; - -Tensor* FillTensor(size_t size) { - Tensor* param = new Tensor(size); - Tensor& p = *param; +paddle::optimizer::Tensor* FillTensor(size_t size) { + paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size); + paddle::optimizer::Tensor& p = *param; for (size_t i = 0; i < p.size(); ++i) { p[i] = (float)rand() / (float)RAND_MAX; } return param; } -Tensor* FixedTensor(size_t size) { - Tensor* param = new Tensor(size); - Tensor& p = *param; +paddle::optimizer::Tensor* FixedTensor(size_t size) { + paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size); + paddle::optimizer::Tensor& p = *param; for (size_t i = 0; i < p.size(); ++i) { p[i] = i; } @@ -28,7 +41,8 @@ Tensor* FixedTensor(size_t size) { class OptimizerTest : public testing::Test { public: - // init tensor shape + virtual ~OptimizerTest() {} + // init paddle::optimizer::Tensor shape const size_t kSize = 5; virtual void SetUp() { @@ -38,34 +52,36 @@ public: virtual void TearDown() {} void CreateSGD() { - Tensor* parameter = FixedTensor(kSize); - config_.set_optimizer(OptimizerConfig::SGD); + paddle::optimizer::Tensor* parameter = FixedTensor(kSize); + config_.set_optimizer(paddle::OptimizerConfig::SGD); config_.mutable_sgd()->set_momentum(0.0); config_.mutable_sgd()->set_decay(0.0); config_.mutable_sgd()->set_nesterov(false); - config_.set_lr_policy(OptimizerConfig::Const); + config_.set_lr_policy(paddle::OptimizerConfig::Const); config_.mutable_const_lr()->set_learning_rate(0.1); std::string str = config_.SerializeAsString(); - ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter); + paddle::optimizer::ParameterOptimizer* opt = + paddle::optimizer::ParameterOptimizer::Create(str, parameter); opts_.push_back(opt); } void CreateAdam() { - Tensor* parameter = FixedTensor(kSize); - config_.set_optimizer(OptimizerConfig::Adam); + paddle::optimizer::Tensor* parameter = FixedTensor(kSize); + config_.set_optimizer(paddle::OptimizerConfig::Adam); config_.mutable_adam()->set_beta_1(0.9); config_.mutable_adam()->set_beta_2(0.1); config_.mutable_adam()->set_epsilon(1e-3); config_.mutable_adam()->set_decay(0.0); - config_.set_lr_policy(OptimizerConfig::Const); + config_.set_lr_policy(paddle::OptimizerConfig::Const); config_.mutable_const_lr()->set_learning_rate(0.1); std::string str = config_.SerializeAsString(); - ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter); + paddle::optimizer::ParameterOptimizer* opt = + paddle::optimizer::ParameterOptimizer::Create(str, parameter); opts_.push_back(opt); } void TestGetWeight() { - Tensor* p = FixedTensor(kSize); + paddle::optimizer::Tensor* p = FixedTensor(kSize); for (size_t i = 0; i < opts_.size(); ++i) { int s = 0; float* newp = (float*)opts_[i]->get_weight(&s); @@ -76,7 +92,7 @@ public: } void TestUpdate() { - Tensor* g = FixedTensor(kSize); + paddle::optimizer::Tensor* g = FixedTensor(kSize); for (size_t i = 0; i < opts_.size(); ++i) { opts_[i]->Update(g); } @@ -91,8 +107,8 @@ public: } private: - std::vector opts_; - OptimizerConfig config_; + std::vector opts_; + paddle::OptimizerConfig config_; }; TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); } diff --git a/paddle/optimizer/serialization_test.cpp b/paddle/optimizer/serialization_test.cpp index d2454140dc..e4d97cbdba 100644 --- a/paddle/optimizer/serialization_test.cpp +++ 
b/paddle/optimizer/serialization_test.cpp @@ -1,19 +1,32 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + #include "serialization.h" #include "gtest/gtest.h" -using namespace paddle; -using namespace paddle::optimizer; - TEST(TensorToProto, Case1) { - Tensor t(3), t1(3); + paddle::optimizer::Tensor t(3), t1(3); for (size_t i = 0; i < t.size(); ++i) { t[i] = i; t1[i] = 0; } - TensorProto proto; - TensorToProto(t, &proto); - ProtoToTensor(proto, &t1); + paddle::TensorProto proto; + paddle::optimizer::TensorToProto(t, &proto); + paddle::optimizer::ProtoToTensor(proto, &t1); for (size_t i = 0; i < t1.size(); ++i) { EXPECT_EQ(t1[i], t[i]); } diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 9a9092af9b..79d2158334 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -672,4 +672,24 @@ void Argument::subArgFrom(const Argument& input, } } +void Argument::reorganizeSeqInfo( + const ICpuGpuVectorPtr seqStartPos, + const ICpuGpuVectorPtr subSeqStartPos, + std::vector>& reorganizedSeqInfo) { + int* seqStarts = seqStartPos->getMutableData(false); + int* subSeqStarts = subSeqStartPos->getMutableData(false); + + int seqNum = seqStartPos->getSize() - 1; + reorganizedSeqInfo.resize(seqNum, std::vector()); + int seqIdx = 0; + for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) { + reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); + if (subSeqStarts[i] == seqStarts[seqIdx + 1]) { + seqIdx++; + if (seqIdx == seqNum) return; + reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); + } + } +} + } // namespace paddle diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index be87175658..38797a76f5 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -318,6 +318,30 @@ struct Argument { */ void printValueString(std::ostream& stream, const std::string& prefix = "") const; + + /** + * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and + * subSequenceStartPositions into a 2 dimensional arrary: reorganizedSeqInfo. + * + * @param seqStartPos: sequenceStartPositions of an Argument. + * @param subSeqStartPos: subSequenceStartPositions of an Argument. + * @param the reorganized sequence start position information. 
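+ * @note reorganizedSeqInfo is an output argument; it is resized and filled
+ *       inside this function.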
+ * + * Examples: + * seqStartPos: [0, 4, 15, 20, 28] + * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28] + * reorganizedSeqInfo: + * [ + * [0,3,4], + * [4,5,7,10,15], + * [15,20], + * [20,22,23,25,28] + * ] + */ + static void reorganizeSeqInfo( + const ICpuGpuVectorPtr seqStartPos, + const ICpuGpuVectorPtr subSeqStartPos, + std::vector>& reorganizedSeqInfo); }; } // namespace paddle diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp index ebe36d4937..f031109501 100644 --- a/paddle/parameter/Parameter.cpp +++ b/paddle/parameter/Parameter.cpp @@ -48,7 +48,8 @@ Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit) deviceId_(-1), sharedCount_(0), updateCounter_(0), - updated_(false) { + updated_(false), + headerFormat_(PARAM_FORMAT_ORIGINAL) { setID(-1); /* capture uninitialized id */ if (useGpu_ && FLAGS_parallel_nn) { /* gpu environment is specified by device property */ @@ -285,7 +286,7 @@ bool Parameter::save(const std::string& filename) const { bool Parameter::save(std::ostream& s) const { CpuVector vec(*bufs_[PARAMETER_VALUE].get()); Header header; - header.version = kFormatVersion; + header.format = headerFormat_; header.valueSize = sizeof(real); header.size = getSize(); @@ -344,8 +345,9 @@ bool Parameter::load(std::istream& s) { Header header; CHECK(s.read(reinterpret_cast(&header), sizeof(header))) << "Fail to read parameter " << getName(); - CHECK_EQ(header.version, kFormatVersion) << "Incorrect format version: " - << header.version; + CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: " + << header.format; + headerFormat_ = header.format; CHECK_EQ(header.size, getSize()) << "The size (" << header.size << ") in the file does not match the size " << "(" << getSize() << ") of the parameter: " << getName(); diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h index 0bac76f068..321f4275d8 100644 --- a/paddle/parameter/Parameter.h +++ b/paddle/parameter/Parameter.h @@ -34,6 +34,20 @@ limitations under the License. */ namespace paddle { +typedef enum { + /// The paddle original basic format + PARAM_FORMAT_ORIGINAL = 0, + + /// See mkldnn_memory_format_t in + /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h + /// for a detailed description. + /// 2D weights tensor in the format (output channels, input channels). + PARAM_FORMAT_MKLDNN_OI, + + /// The total format items numbers + PARAM_FORMAT_ITEMS, +} PARAM_FORMAT; + class SparsePrefetchRowCpuMatrix; class Parameter; @@ -51,7 +65,10 @@ public: size_t getSize() const { return config_.size(); } bool isFullSize() const { - return this->getSize() == bufs_[PARAMETER_VALUE]->getSize(); + if (bufs_[PARAMETER_VALUE]) { + return this->getSize() == bufs_[PARAMETER_VALUE]->getSize(); + } + return false; } inline bool useGpu() const { return useGpu_; } @@ -242,14 +259,30 @@ public: /// Initialize the value to 0 void zeroMem(); - static const int kFormatVersion = 0; /// file header structure struct Header { - int32_t version; // = 0, file format version + int32_t format; // = PARAM_FORMAT uint32_t valueSize; // = sizeof(real) uint64_t size; // = getSize() }; + /** + * @brief Is the header format supported. + */ + static bool isHeaderFormatSupported(int32_t fmt) { + return fmt < PARAM_FORMAT_ITEMS; + } + + /** + * @brief Get the format in header. + */ + int getHeaderFormat() { return headerFormat_; } + + /** + * @brief Set the format in header. 
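+   * @param fmt one of the PARAM_FORMAT values declared above.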
+ */ + void setHeaderFormat(int32_t fmt) { headerFormat_ = fmt; } + /** * @brief Parameter Update Hook. * @@ -321,6 +354,9 @@ protected: bool updated_; SparseFormat format_; + /// The header format for saving or loading param + int32_t headerFormat_; + std::vector> updaterHooks_; public: diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 358d14f455..120eb1e4af 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,15 +1,24 @@ -add_subdirectory(dynload) +cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog) +cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) -nv_test(cuda_test SRCS cuda_test.cu) +nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) +add_subdirectory(dynload) + +cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece) +cc_test(environment_test SRCS environment_test.cc DEPS stringpiece) + IF(WITH_GPU) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) ELSE() set(GPU_CTX_DEPS) ENDIF() -cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS}) -nv_test(device_context_test SRCS device_context_test.cc DEPS device_context glog gflags) +# memcpy deoends on device_context, here add deps individually for +# avoiding cycle dependencies +cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator + system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) +nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc new file mode 100644 index 0000000000..78e1fa9df5 --- /dev/null +++ b/paddle/platform/cpu_info.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/cpu_info.h" + +#ifdef __APPLE__ +#include +#include +#else +#include +#endif + +#include "gflags/gflags.h" + +DEFINE_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +inline size_t CpuTotalPhysicalMemory() { +#ifdef __APPLE__ + int mib[2]; + mib[0] = CTL_HW; + mib[1] = HW_MEMSIZE; + int64_t size = 0; + size_t len = sizeof(size); + if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; + return 0L; +#else + int64_t pages = sysconf(_SC_PHYS_PAGES); + int64_t page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +#endif +} + +size_t CpuMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t CpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 4 KB. + return 1 << 12; +} + +size_t CpuMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. 
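+  // (CpuMaxAllocSize() / 32 is 3.125%; e.g. about 2 GB when 64 GB of CPU
+  // memory is usable.)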
+ return CpuMaxAllocSize() / 32; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cuda.h b/paddle/platform/cpu_info.h similarity index 50% rename from paddle/platform/cuda.h rename to paddle/platform/cpu_info.h index 96889abf9e..8df7c7b4bc 100644 --- a/paddle/platform/cuda.h +++ b/paddle/platform/cpu_info.h @@ -14,37 +14,19 @@ limitations under the License. */ #pragma once -#ifndef PADDLE_ONLY_CPU - -#include -#include +#include namespace paddle { namespace platform { -inline void throw_on_error(cudaError_t e, const char* message) { - if (e) { - throw thrust::system_error(e, thrust::cuda_category(), message); - } -} - -inline int GetDeviceCount(void) { - int count; - throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); - return count; -} +//! Get the maximum allocation size for a machine. +size_t CpuMaxAllocSize(); -inline int GetCurrentDeviceId(void) { - int device_id; - throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); - return device_id; -} +//! Get the minimum chunk size for buddy allocator. +size_t CpuMinChunkSize(); -inline void SetDeviceId(int device_id) { - throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed"); -} +//! Get the maximum chunk size for buddy allocator. +size_t CpuMaxChunkSize(); } // namespace platform } // namespace paddle - -#endif // PADDLE_ONLY_CPU diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc new file mode 100644 index 0000000000..8fb195aa7c --- /dev/null +++ b/paddle/platform/cpu_info_test.cc @@ -0,0 +1,21 @@ +#include "paddle/platform/cpu_info.h" +#include "paddle/string/printf.h" + +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +DECLARE_double(fraction_of_cpu_memory_to_use); + +TEST(CpuMemoryUsage, Print) { + std::stringstream ss; + size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024; + float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100; + + std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n", + use_percent, memory_size) + << std::endl; +} diff --git a/paddle/platform/cuda_test.cu b/paddle/platform/cuda_test.cu deleted file mode 100644 index 4067dda2f1..0000000000 --- a/paddle/platform/cuda_test.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include "gtest/gtest.h" - -#define CHECK_ERR(x) \ - if (x != cudaSuccess) { \ - fprintf(stderr, \ - "%s in %s at line %d\n", \ - cudaGetErrorString(err), \ - __FILE__, \ - __LINE__); \ - exit(-1); \ - } - -__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < n) { - d_C[i] = d_A[i] + d_B[i]; - } -} - -TEST(Cuda, Equality) { - int n = 10; - // Memory allocation for h_A, h_B and h_C (in the host) - float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0}; - float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0}; - float h_C[10]; - float *d_A, *d_B, *d_C; - cudaError_t err; - // Memory allocation for d_A, d_B and d_C (in the device) - err = cudaMalloc((void **)&d_A, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_B, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_C, sizeof(float) * n); - CHECK_ERR(err); - - // Copying memory to device - err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - // Calling the kernel - 
vecAdd<<>>(d_A, d_B, d_C, n); - - // Copying results back to host - err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost); - CHECK_ERR(err); - - EXPECT_EQ(h_C[0], 1.0); - for (int i = 1; i < n - 1; ++i) { - EXPECT_EQ(h_C[i], 11.0); - } - EXPECT_EQ(h_C[9], 1.0); -} diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 960ef0a595..ad212c5b2c 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -10,21 +10,146 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/platform/device_context.h" +#include "paddle/memory/memory.h" namespace paddle { namespace platform { template <> -Eigen::DefaultDevice* DeviceContext::get_eigen_device() { - return reinterpret_cast(this)->eigen_device(); +Eigen::DefaultDevice* DeviceContext::get_eigen_device() + const { + return reinterpret_cast(this)->eigen_device(); } +CPUDeviceContext::CPUDeviceContext() { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +CPUDeviceContext::CPUDeviceContext(CPUPlace place) { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } + #ifndef PADDLE_ONLY_CPU + +class EigenCudaStreamDevice : public Eigen::StreamInterface { + public: + EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenCudaStreamDevice() override {} + + void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) { + stream_ = cuda_stream; + place_ = place; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const cudaStream_t& stream() const override { return *stream_; } + + const cudaDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + return paddle::memory::Alloc(place_, num_bytes); + } + + void deallocate(void* buffer) const override { + paddle::memory::Free(place_, buffer); + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = + static_cast(scratchpad()) + Eigen::kCudaScratchSize; + semaphore_ = reinterpret_cast(scratch); + PADDLE_ENFORCE( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); + } + return semaphore_; + } + + private: + GPUPlace place_; + const cudaStream_t* stream_; // not owned; + const cudaDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + template <> -Eigen::GpuDevice* DeviceContext::get_eigen_device() { - return reinterpret_cast(this)->eigen_device(); +Eigen::GpuDevice* DeviceContext::get_eigen_device() const { + return reinterpret_cast(this)->eigen_device(); } -#endif + +CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { + SetDeviceId(place_.device); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + eigen_stream_.reset(new EigenCudaStreamDevice()); + eigen_stream_->Reinitialize(&stream_, place); + eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); +} + +CUDADeviceContext::~CUDADeviceContext() { + SetDeviceId(place_.device); + Wait(); + if (cublas_handle_) { + PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); + } + + if (cudnn_handle_) { + 
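+    // cudnn_handle_ (like cublas_handle_) is created lazily in cudnn_handle(),
+    // so it is destroyed only if it was actually used.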
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + } + + eigen_stream_.reset(); + eigen_device_.reset(); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +} + +Place CUDADeviceContext::GetPlace() const { return place_; } + +void CUDADeviceContext::Wait() const { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); +} + +Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +cublasHandle_t CUDADeviceContext::cublas_handle() { + if (!cublas_handle_) { + SetDeviceId(place_.device); + PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); + } + return cublas_handle_; +} + +cudnnHandle_t CUDADeviceContext::cudnn_handle() { + if (!cudnn_handle_) { + SetDeviceId(place_.device); + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); + } + return cudnn_handle_; +} + +cudaStream_t CUDADeviceContext::stream() { return stream_; } + +#endif // PADDLE_ONLY_CPU } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 7de07d06be..11528e1194 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -11,17 +11,18 @@ limitations under the License. */ #pragma once -#include "paddle/framework/enforce.h" +#include "paddle/platform/enforce.h" +#include "paddle/platform/place.h" + #ifndef PADDLE_ONLY_CPU -#include "paddle/platform/cuda.h" #include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cudnn.h" -#include "paddle/platform/dynload/curand.h" +#include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif -#include #include -#include +#include "paddle/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace platform { @@ -32,149 +33,62 @@ class DeviceContext { virtual Place GetPlace() const = 0; template - DeviceType* get_eigen_device(); + DeviceType* get_eigen_device() const; }; class CPUDeviceContext : public DeviceContext { public: - Eigen::DefaultDevice* eigen_device() { - if (!eigen_device_) { - eigen_device_.reset(new Eigen::DefaultDevice()); - } - return eigen_device_.get(); - } - - Place GetPlace() const override { - Place retv = CPUPlace(); - return retv; - } + CPUDeviceContext(); + explicit CPUDeviceContext(CPUPlace place); + virtual ~CPUDeviceContext() {} + + Eigen::DefaultDevice* eigen_device() const; + + Place GetPlace() const override; private: std::unique_ptr eigen_device_; }; #ifndef PADDLE_ONLY_CPU +class EigenCudaStreamDevice; -class GPUPlaceGuard { +class CUDADeviceContext : public DeviceContext { public: - explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { - if (previous_ != new_place) { - paddle::platform::SetDeviceId(new_place.device); - } - } + explicit CUDADeviceContext(GPUPlace place); + virtual ~CUDADeviceContext(); - ~GPUPlaceGuard() { paddle::platform::SetDeviceId(previous_.device); } + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const; - private: - GPUPlace previous_; -}; + /*! \brief Return place in the device context. 
*/ + Place GetPlace() const override; -class CUDADeviceContext : public DeviceContext { - public: - explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { - GPUPlaceGuard guard(gpu_place_); - paddle::platform::throw_on_error(cudaStreamCreate(&stream_), - "cudaStreamCreate failed"); - eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_)); - eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); - } - - Place GetPlace() const override { - Place retv = GPUPlace(); - return retv; - } - - void Wait() { - paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), - "cudaStreamSynchronize failed"); - } - - cudaStream_t stream() { return stream_; } - - Eigen::GpuDevice* eigen_device() { return eigen_device_.get(); } - - cublasHandle_t cublas_handle() { - if (!blas_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_) == - CUBLAS_STATUS_SUCCESS, - "cublasCreate failed"); - PADDLE_ENFORCE(paddle::platform::dynload::cublasSetStream( - blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS, - "cublasSetStream failed"); - } - return blas_handle_; - } - - cudnnHandle_t cudnn_handle() { - if (!dnn_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_) == - CUDNN_STATUS_SUCCESS, - "cudnnCreate failed"); - PADDLE_ENFORCE(paddle::platform::dynload::cudnnSetStream( - dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS, - "cudnnSetStream failed"); - } - return dnn_handle_; - } - - curandGenerator_t curand_generator() { - if (!rand_generator_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( - &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == - CURAND_STATUS_SUCCESS, - "curandCreateGenerator failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( - rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS, - "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE(paddle::platform::dynload::curandSetStream( - rand_generator_, stream_) == CURAND_STATUS_SUCCESS, - "curandSetStream failed"); - } - return rand_generator_; - } - - ~CUDADeviceContext() { - Wait(); - if (blas_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_) == - CUBLAS_STATUS_SUCCESS, - "cublasDestroy failed"); - } - - if (dnn_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_) == - CUDNN_STATUS_SUCCESS, - "cudnnDestroy failed"); - } - - if (rand_generator_) { - PADDLE_ENFORCE(paddle::platform::dynload::curandDestroyGenerator( - rand_generator_) == CURAND_STATUS_SUCCESS, - "curandDestroyGenerator failed"); - } - eigen_stream_.reset(); - eigen_device_.reset(); - paddle::platform::throw_on_error(cudaStreamDestroy(stream_), - "cudaStreamDestroy failed"); - } + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; - private: - GPUPlace gpu_place_; - cudaStream_t stream_; + // clang-format off + /*! \brief Return cublas handle in the device context. */ + cublasHandle_t cublas_handle(); - std::unique_ptr eigen_stream_; - std::unique_ptr eigen_device_; + /*! \brief Return cudnn handle in the device context. */ + cudnnHandle_t cudnn_handle(); - cublasHandle_t blas_handle_{nullptr}; + /*! \brief Return cuda stream in the device context. 
*/ + cudaStream_t stream(); + // clang-format on - cudnnHandle_t dnn_handle_{nullptr}; + private: + GPUPlace place_; + + std::unique_ptr eigen_device_; + std::unique_ptr eigen_stream_; - int random_seed_; - curandGenerator_t rand_generator_{nullptr}; + // clang-format off + cudaStream_t stream_{nullptr}; + cudnnHandle_t cudnn_handle_{nullptr}; + cublasHandle_t cublas_handle_{nullptr}; + // clang-format on }; #endif diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index af2ce17fc2..5883a55272 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -15,32 +15,35 @@ limitations under the License. */ #include "paddle/platform/device_context.h" #include "gtest/gtest.h" -using DEVICE_GPU = Eigen::GpuDevice; TEST(Device, Init) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::GPUPlace; + int count = paddle::platform::GetDeviceCount(); for (int i = 0; i < count; i++) { - paddle::platform::DeviceContext* device_context = - new paddle::platform::CUDADeviceContext(i); + DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); Eigen::GpuDevice* gpu_device = - device_context->template get_eigen_device(); + device_context->template get_eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; } } TEST(Device, CUDADeviceContext) { + using paddle::platform::CUDADeviceContext; + using paddle::platform::GPUPlace; + int count = paddle::platform::GetDeviceCount(); for (int i = 0; i < count; i++) { - paddle::platform::CUDADeviceContext* device_context = - new paddle::platform::CUDADeviceContext(i); + CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); cublasHandle_t cublas_handle = device_context->cublas_handle(); ASSERT_NE(nullptr, cublas_handle); - curandGenerator_t curand_handle = device_context->curand_generator(); - ASSERT_NE(nullptr, curand_handle); + ASSERT_NE(nullptr, device_context->stream()); delete device_context; } } diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc index 4e3dfdaefb..9cd2a1f565 100644 --- a/paddle/platform/dynload/cublas.cc +++ b/paddle/platform/dynload/cublas.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h index c44b7240a8..9d8343c0b5 100644 --- a/paddle/platform/dynload/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -48,13 +48,13 @@ extern void *cublas_dso_handle; }; \ extern DynLoad__##__name __name #else -#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - inline template \ - cublasStatus_t operator()(Args... 
args) { \ - return __name(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline cublasStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ + }; \ extern DynLoad__##__name __name #endif @@ -62,12 +62,12 @@ extern void *cublas_dso_handle; DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasSgemv); \ - __macro(cublasDgemv); \ - __macro(cublasSgemm); \ - __macro(cublasDgemm); \ - __macro(cublasSgeam); \ - __macro(cublasDgeam); \ + __macro(cublasSgemv_v2); \ + __macro(cublasDgemv_v2); \ + __macro(cublasSgemm_v2); \ + __macro(cublasDgemm_v2); \ + __macro(cublasSgeam_v2); \ + __macro(cublasDgeam_v2); \ __macro(cublasCreate_v2); \ __macro(cublasDestroy_v2); \ __macro(cublasSetStream_v2); \ diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc index 8b5e15b5ef..d3e4cb567d 100644 --- a/paddle/platform/dynload/cudnn.cc +++ b/paddle/platform/dynload/cudnn.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { @@ -25,4 +39,4 @@ CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); } // namespace dynload } // namespace platform -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/platform/dynload/curand.cc b/paddle/platform/dynload/curand.cc index 5c1fab992c..d05dd88126 100644 --- a/paddle/platform/dynload/curand.cc +++ b/paddle/platform/dynload/curand.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + #include namespace paddle { @@ -10,6 +24,7 @@ void *curand_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); -} -} -} \ No newline at end of file + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynload/curand.h b/paddle/platform/dynload/curand.h index d8c46bc41e..7bfe0778c7 100644 --- a/paddle/platform/dynload/curand.h +++ b/paddle/platform/dynload/curand.h @@ -55,6 +55,7 @@ extern void *curand_dso_handle; __macro(curandSetPseudoRandomGeneratorSeed); \ __macro(curandGenerateUniform); \ __macro(curandGenerateUniformDouble); \ + __macro(curandGenerateNormal); \ __macro(curandDestroyGenerator); CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc index dd914e006d..ae9a0a982c 100644 --- a/paddle/platform/dynload/dynamic_loader.cc +++ b/paddle/platform/dynload/dynamic_loader.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include #include "gflags/gflags.h" #include "glog/logging.h" -#include "paddle/framework/enforce.h" +#include "paddle/platform/enforce.h" DEFINE_string(cudnn_dir, "", "Specify path for loading libcudnn.so. For instance, " diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h new file mode 100644 index 0000000000..81448897e9 --- /dev/null +++ b/paddle/platform/enforce.h @@ -0,0 +1,237 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // for dladdr +#include // for backtrace +#include +#include +#include +#include +#include + +#include "paddle/string/printf.h" +#include "paddle/string/to_string.h" + +#ifdef __GNUC__ +#include // for __cxa_demangle +#endif + +#ifndef PADDLE_ONLY_CPU + +#include "paddle/platform/dynload/cublas.h" +#include "paddle/platform/dynload/cudnn.h" +#include "paddle/platform/dynload/curand.h" + +#include +#include +#include +#include +#include + +#endif // PADDLE_ONLY_CPU + +namespace paddle { +namespace platform { + +namespace { +#ifdef __GNUC__ +inline std::string demangle(std::string name) { + int status = -4; // some arbitrary value to eliminate the compiler warning + std::unique_ptr res{ + abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free}; + return (status == 0) ? 
res.get() : name; +} +#else +inline std::string demangle(std::string name) { return name; } +#endif +} + +struct EnforceNotMet : public std::exception { + std::exception_ptr exp_; + std::string err_str_; + EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) { + static constexpr int TRACE_STACK_LIMIT = 100; + try { + std::rethrow_exception(exp_); + } catch (const std::exception& exp) { + std::ostringstream sout; + + sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl; + sout << "PaddlePaddle Call Stacks: " << std::endl; + + void* call_stack[TRACE_STACK_LIMIT]; + auto size = backtrace(call_stack, TRACE_STACK_LIMIT); + auto symbols = backtrace_symbols(call_stack, size); + + Dl_info info; + for (int i = 0; i < size; ++i) { + if (dladdr(call_stack[i], &info)) { + auto demangled = demangle(info.dli_sname); + auto addr_offset = static_cast(call_stack[i]) - + static_cast(info.dli_saddr); + sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, + 2 + sizeof(void*) * 2, call_stack[i], + demangled, addr_offset); + } else { + sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2, + call_stack[i]); + } + } + free(symbols); + err_str_ = sout.str(); + } + } + + const char* what() const noexcept { return err_str_.c_str(); } +}; + +// Because most enforce conditions would evaluate to true, we can use +// __builtin_expect to instruct the C++ compiler to generate code that +// always forces branch prediction of true. +// This generates faster binary code. __builtin_expect is since C++11. +// For more details, please check https://stackoverflow.com/a/43870188/724872. +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) + +template +inline typename std::enable_if::type throw_on_error( + int stat, const Args&... args) { + if (UNLIKELY(!(stat))) { + throw std::runtime_error(string::Sprintf(args...)); + } +} + +#ifndef PADDLE_ONLY_CPU + +template +inline typename std::enable_if::type throw_on_error( + cudaError_t e, const Args&... args) { + if (UNLIKELY(e)) { + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + curandStatus_t stat, const Args&... args) { + if (stat != CURAND_STATUS_SUCCESS) { + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + cudnnStatus_t stat, const Args&... args) { + if (stat == CUDNN_STATUS_SUCCESS) { + return; + } else { + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); + } +} + +template +inline typename std::enable_if::type throw_on_error( + cublasStatus_t stat, const Args&... 
args) { + std::string err; + if (stat == CUBLAS_STATUS_SUCCESS) { + return; + } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + err = "CUBLAS: not initialized, "; + } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { + err = "CUBLAS: alloc failed, "; + } else if (stat == CUBLAS_STATUS_INVALID_VALUE) { + err = "CUBLAS: invalid value, "; + } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { + err = "CUBLAS: arch mismatch, "; + } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) { + err = "CUBLAS: mapping error, "; + } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { + err = "CUBLAS: execution failed, "; + } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { + err = "CUBLAS: internal error, "; + } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { + err = "CUBLAS: not supported, "; + } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { + err = "CUBLAS: license error, "; + } + throw std::runtime_error(err + string::Sprintf(args...)); +} + +#endif // PADDLE_ONLY_CPU + +template +inline void throw_on_error(T e) { + throw_on_error(e, ""); +} + +#define PADDLE_THROW(...) \ + do { \ + throw ::paddle::platform::EnforceNotMet( \ + std::make_exception_ptr( \ + std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \ + __FILE__, __LINE__); \ + } while (0) + +#define PADDLE_ENFORCE(...) \ + do { \ + try { \ + ::paddle::platform::throw_on_error(__VA_ARGS__); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ + } while (0) + +/* + * Some enforce helpers here, usage: + * int a = 1; + * int b = 2; + * PADDLE_ENFORCE_EQ(a, b); + * + * will raise an expression described as follows: + * "enforce a == b failed, 1 != 2" with detailed stack information. + * + * extra messages is also supported, for example: + * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) + */ + +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) +#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ + PADDLE_ENFORCE(nullptr != (__VAL), #__VAL " should not be null\n%s", \ + paddle::string::Sprintf("" __VA_ARGS__)); + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ + PADDLE_ENFORCE(__VAL0 __CMP __VAL1, \ + "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \ + #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ + paddle::string::to_string(__VAL1), \ + paddle::string::Sprintf("" __VA_ARGS__)); + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc new file mode 100644 index 0000000000..80bdee3d9d --- /dev/null +++ b/paddle/platform/enforce_test.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/platform/enforce.h" +#include "paddle/string/piece.h" + +using StringPiece = paddle::string::Piece; +using paddle::string::HasPrefix; + +TEST(ENFORCE, OK) { + PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345); + size_t val = 1; + const size_t limit = 10; + PADDLE_ENFORCE(val < limit, "Enforce is OK too"); +} + +TEST(ENFORCE, FAILED) { + bool caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE, NO_ARG_OK) { + int a = 2; + int b = 2; + PADDLE_ENFORCE_EQ(a, b); + // test enforce with extra message. + PADDLE_ENFORCE_EQ(a, b, "some thing wrong %s", "info"); +} + +TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { + int a = 2; + bool caught_exception = false; + try { + PADDLE_ENFORCE_EQ(a, 1 + 3); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + HasPrefix(StringPiece(error.what()), "enforce a == 1 + 3 failed, 2 != 4"); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { + int a = 2; + bool caught_exception = false; + try { + PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + HasPrefix(StringPiece(error.what()), + "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_NE, OK) { + PADDLE_ENFORCE_NE(1, 2); + PADDLE_ENFORCE_NE(1.0, 2UL); +} +TEST(ENFORCE_NE, FAIL) { + bool caught_exception = false; + + try { + // 2UL here to check data type compatible + PADDLE_ENFORCE_NE(1.0, 1UL); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), + "enforce 1.0 != 1UL failed, 1 == 1")) + << error.what() << " does not have expected prefix"; + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); } +TEST(ENFORCE_GT, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_GE, OK) { + PADDLE_ENFORCE_GE(2, 2UL); + PADDLE_ENFORCE_GE(3, 2UL); + PADDLE_ENFORCE_GE(3, 2); + PADDLE_ENFORCE_GE(3.21, 2UL); +} +TEST(ENFORCE_GE, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_GE(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_LE, OK) { + PADDLE_ENFORCE_LE(1, 1); + PADDLE_ENFORCE_LE(1, 1UL); + PADDLE_ENFORCE_LE(2, 3UL); + PADDLE_ENFORCE_LE(2UL, 3); + PADDLE_ENFORCE_LE(2UL, 3.2); +} +TEST(ENFORCE_LE, FAIL) { + bool caught_exception = false; + 
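+  // Note: this failure case goes through PADDLE_ENFORCE_GT below, so the
+  // expected message is the GT failure text rather than an LE one.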
try { + PADDLE_ENFORCE_GT(1, 2UL); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_LT, OK) { + PADDLE_ENFORCE_LT(3, 10); + PADDLE_ENFORCE_LT(2, 3UL); + PADDLE_ENFORCE_LT(2UL, 3); +} +TEST(ENFORCE_LT, FAIL) { + bool caught_exception = false; + try { + PADDLE_ENFORCE_LT(1UL, 0.12); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), + "enforce 1UL < 0.12 failed, 1 >= 0.12")); + } + EXPECT_TRUE(caught_exception); +} + +TEST(ENFORCE_NOT_NULL, OK) { + int* a = new int; + PADDLE_ENFORCE_NOT_NULL(a); + delete a; +} +TEST(ENFORCE_NOT_NULL, FAIL) { + bool caught_exception = false; + try { + int* a = nullptr; + PADDLE_ENFORCE_NOT_NULL(a); + + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null")); + } + EXPECT_TRUE(caught_exception); +} + +struct Dims { + size_t dims_[4]; + + bool operator==(const Dims& o) const { + for (size_t i = 0; i < 4; ++i) { + if (dims_[i] != o.dims_[i]) return false; + } + return true; + } +}; + +std::ostream& operator<<(std::ostream& os, const Dims& d) { + for (size_t i = 0; i < 4; ++i) { + if (i == 0) { + os << "["; + } + os << d.dims_[i]; + if (i == 4 - 1) { + os << "]"; + } else { + os << ", "; + } + } + return os; +} + +TEST(ENFORCE_USER_DEFINED_CLASS, EQ) { + Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}}; + PADDLE_ENFORCE_EQ(a, b); +} + +TEST(ENFORCE_USER_DEFINED_CLASS, NE) { + Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; + ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet); +} \ No newline at end of file diff --git a/paddle/platform/environment.h b/paddle/platform/environment.h new file mode 100644 index 0000000000..4edcce932e --- /dev/null +++ b/paddle/platform/environment.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/platform/enforce.h" +#include "paddle/string/piece.h" + +extern char** environ; // for environment variables + +namespace paddle { +namespace platform { + +inline void SetEnvVariable(const std::string& name, const std::string& value) { + PADDLE_ENFORCE_NE(setenv(name.c_str(), value.c_str(), 1), -1, + "Failed to set environment variable %s=%s", name, value); +} + +inline void UnsetEnvVariable(const std::string& name) { + PADDLE_ENFORCE_NE(unsetenv(name.c_str()), -1, + "Failed to unset environment variable %s", name); +} + +inline bool IsEnvVarDefined(const std::string& name) { + return std::getenv(name.c_str()) != nullptr; +} + +inline std::string GetEnvValue(const std::string& name) { + PADDLE_ENFORCE(IsEnvVarDefined(name), + "Tried to access undefined environment variable %s", name); + return std::getenv(name.c_str()); +} + +inline std::vector GetAllEnvVariables() { + std::vector vars; + for (auto var = environ; *var != nullptr; ++var) { + auto tail = string::Index(*var, "="); + auto name = string::SubStr(*var, 0, tail).ToString(); + vars.push_back(name); + } + return vars; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/environment_test.cc b/paddle/platform/environment_test.cc new file mode 100644 index 0000000000..5f13652721 --- /dev/null +++ b/paddle/platform/environment_test.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/platform/environment.h" + +#include "glog/logging.h" +#include "gtest/gtest.h" + +TEST(ENVIRONMENT, ACCESS) { + namespace platform = paddle::platform; + namespace string = paddle::string; + + platform::SetEnvVariable("PADDLE_USE_ENV", "TRUE"); + + EXPECT_TRUE(platform::IsEnvVarDefined("PADDLE_USE_ENV")); + EXPECT_EQ(platform::GetEnvValue("PADDLE_USE_ENV"), "TRUE"); + + platform::UnsetEnvVariable("PADDLE_USE_ENV"); + EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV")); + + platform::SetEnvVariable("PADDLE_USE_ENV1", "Hello "); + platform::SetEnvVariable("PADDLE_USE_ENV2", "World, "); + platform::SetEnvVariable("PADDLE_USE_ENV3", "PaddlePaddle!"); + + std::string env_info; + auto vars = platform::GetAllEnvVariables(); + for_each(vars.begin(), vars.end(), [&](const std::string& var) { + env_info += platform::GetEnvValue(var); + }); + + EXPECT_TRUE(string::Contains(env_info, "Hello World, PaddlePaddle!")); + platform::UnsetEnvVariable("PADDLE_USE_ENV1"); + platform::UnsetEnvVariable("PADDLE_USE_ENV2"); + platform::UnsetEnvVariable("PADDLE_USE_ENV3"); + + env_info.clear(); + vars = platform::GetAllEnvVariables(); + for_each(vars.begin(), vars.end(), [&](const std::string& var) { + env_info += platform::GetEnvValue(var); + }); + + EXPECT_FALSE(string::Contains(env_info, "Hello World, PaddlePaddle!")); + EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV1")); + EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV2")); + EXPECT_FALSE(platform::IsEnvVarDefined("PADDLE_USE_ENV3")); +} diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc new file mode 100644 index 0000000000..be381a4e26 --- /dev/null +++ b/paddle/platform/gpu_info.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/gpu_info.h" + +#include "gflags/gflags.h" + +#include "paddle/platform/enforce.h" +#include "paddle/platform/environment.h" + +DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, + "Default use 95% of GPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +int GetDeviceCount() { + int count; + PADDLE_ENFORCE( + cudaGetDeviceCount(&count), + "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount"); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE( + cudaGetDevice(&device_id), + "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId"); + return device_id; +} + +void SetDeviceId(int id) { + PADDLE_ENFORCE(cudaSetDevice(id), + "cudaSetDevice failed in paddle::platform::SetDeviceId"); +} + +void GpuMemoryUsage(size_t &available, size_t &total) { + PADDLE_ENFORCE(cudaMemGetInfo(&available, &total), + "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); +} + +size_t GpuMaxAllocSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + // Reserve the rest for page tables, etc. 
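+  // For example, on a 16 GB GPU with the default
+  // fraction_of_gpu_memory_to_use of 0.95, roughly 15.2 GB can be allocated.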
+ return static_cast(total * FLAGS_fraction_of_gpu_memory_to_use); +} + +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t GpuMaxChunkSize() { + size_t total = 0; + size_t available = 0; + + GpuMemoryUsage(available, total); + + if (IsEnvVarDefined(kEnvFractionGpuMemoryToUse)) { + auto val = std::stod(GetEnvValue(kEnvFractionGpuMemoryToUse)); + PADDLE_ENFORCE_GT(val, 0.0); + PADDLE_ENFORCE_LE(val, 1.0); + FLAGS_fraction_of_gpu_memory_to_use = val; + } + + // Reserving the rest memory for page tables, etc. + size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total; + + // If available less than minimum chunk size, no usable memory exists. + available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(); + + // If available less than reserving, no usable memory exists. + size_t usable = std::max(available, reserving) - reserving; + + return usable; +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind, cudaStream_t stream) { + PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), + "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); +} + +void GpuMemcpySync(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind) { + PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), + "cudaMemcpy failed in paddle::platform::GpuMemcpySync"); + // note: cudaMemcpy may actually be asynchronous with respect to the caller, + // block on stream 0 to make sure the copy has completed + PADDLE_ENFORCE( + cudaStreamSynchronize(0), + "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync"); +} + +void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, + size_t count, cudaStream_t stream) { + PADDLE_ENFORCE( + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream), + "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer"); +} +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h new file mode 100644 index 0000000000..ed2420b874 --- /dev/null +++ b/paddle/platform/gpu_info.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifndef PADDLE_ONLY_CPU + +#include +#include +#include + +namespace paddle { +namespace platform { + +//! Environment variable: fraction of GPU memory to use on each device. +const std::string kEnvFractionGpuMemoryToUse = + "PADDLE_FRACTION_GPU_MEMORY_TO_USE"; + +//! Get the total number of GPU devices in system. +int GetDeviceCount(); + +//! Get the current GPU device id in system. +int GetCurrentDeviceId(); + +//! Set the GPU device id for next execution. +void SetDeviceId(int device_id); + +//!Get the memory usage of current GPU device. +void GpuMemoryUsage(size_t &available, size_t &total); + +//! Get the maximum allocation size of current GPU device. +size_t GpuMaxAllocSize(); + +//! Get the minimum chunk size for GPU buddy allocator. 
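+//! (256 bytes in the current implementation; see GpuMinChunkSize in gpu_info.cc.)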
+size_t GpuMinChunkSize(); + +//! Get the maximum chunk size for GPU buddy allocator. +size_t GpuMaxChunkSize(); + +//! Copy memory from address src to dst asynchronously. +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind, cudaStream_t stream); + +//! Copy memory from address src to dst synchronously. +void GpuMemcpySync(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind); + +//! Copy memory from one device to another device. +void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, + size_t count, cudaStream_t stream); + +} // namespace platform +} // namespace paddle + +#endif // PADDLE_ONLY_CPU diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 0704820aa0..b31515e1f0 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #include "paddle/platform/place.h" namespace paddle { @@ -7,7 +21,7 @@ namespace detail { class PlacePrinter : public boost::static_visitor<> { public: - PlacePrinter(std::ostream &os) : os_(os) {} + explicit PlacePrinter(std::ostream &os) : os_(os) {} void operator()(const CPUPlace &) { os_ << "CPUPlace"; } void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 7cead18388..1117476bb3 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once -#include #include +#include "paddle/platform/variant.h" namespace paddle { namespace platform { @@ -32,7 +32,7 @@ struct CPUPlace { struct GPUPlace { GPUPlace() : GPUPlace(0) {} - GPUPlace(int d) : device(d) {} + explicit GPUPlace(int d) : device(d) {} // needed for variant equality comparison inline bool operator==(const GPUPlace &o) const { return device == o.device; } diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h new file mode 100644 index 0000000000..c2257af1b5 --- /dev/null +++ b/paddle/platform/variant.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include + +#ifndef PADDLE_ONLY_CPU + +// Because boost's variadic templates has bug on nvcc, boost will disable +// variadic template support when GPU enabled on nvcc. +// Define BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same +// function symbols. 
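+// Otherwise translation units compiled by nvcc and by the host compiler can
+// disagree on boost::variant's template signature and fail to link.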
+// +// https://github.com/PaddlePaddle/Paddle/issues/3386 +#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES +#define BOOST_NO_CXX11_VARIADIC_TEMPLATES +#endif +#endif + +#include diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp index f7e391f763..54063a809a 100644 --- a/paddle/pserver/ParameterClient2.cpp +++ b/paddle/pserver/ParameterClient2.cpp @@ -65,7 +65,6 @@ void ParameterClient2::initThreads() { LOG(INFO) << "parallel_thread_num dosent need to set"; } syncThreadPool_.reset(new SyncThreadPool(threadNum_)); - startThreads(); } @@ -224,6 +223,14 @@ void ParameterClient2::prepareSendData( request.set_cost(cost); request.set_batch_status(batchStatus); CHECK_EQ(request.blocks_size(), 0); + VLOG(10) << "request: trainer_id: " << request.trainer_id() + << " update_mode" << request.update_mode() + << " send_back_parameter: " << request.send_back_parameter() + << " send_back_parameter_type: " + << request.send_back_parameter_type() + << " num_samples: " << request.num_samples() + << " cost: " << request.cost() + << " batch_status: " << request.batch_status(); } for (const auto& segments : parameterSegments) { const auto it = parameterMap_.find(segments.id); @@ -251,11 +258,17 @@ void ParameterClient2::prepareSendData( CHECK(sendMat != nullptr) << "sendMat is nullptr"; syncThreadPool_->exec([&](int tid, size_t numThreads) { + std::lock_guard guard(sparseAutoGrowthMutex_); const auto& localIndices = prefetchMat->getLocalIndices(); /// num of sparse rows size_t nLocalBlocks = localIndices.size(); uint64_t beginDim = 0; uint64_t endDim = 0; + + // FIXME(typhoonzero): let it resize first + prefetchMat->getLocalRow(nLocalBlocks + 1); + sendMat->getLocalRow(nLocalBlocks + 1); + for (size_t row = 0; row < nLocalBlocks; ++row) { int64_t blockId = localIndices[row]; // local row -> sparse row int serverId = std::abs((blockId + nameHash) % serviceNum_); @@ -275,7 +288,6 @@ void ParameterClient2::prepareSendData( block->set_begin_pos(row * blockSize); /// block len block->set_block_size(endDim - beginDim); - if (sendingPara) { sendJob->parallelInputIovs[serverId].push_back( {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize}); diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h index 89b3ddd502..29b9eeacdd 100644 --- a/paddle/pserver/ParameterClient2.h +++ b/paddle/pserver/ParameterClient2.h @@ -583,6 +583,7 @@ protected: #ifndef PADDLE_DISABLE_TIMER uint64_t forwardbackwordTime_; #endif + std::mutex sparseAutoGrowthMutex_; /// map id to parameter used for decoding protobuf data std::unordered_map parameterMap_; diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index d7c1d4f788..54f5c4c0fb 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -1032,8 +1032,8 @@ void ParameterServer2::loadValueVector(const LoadValueRequest& request, Parameter::Header header; CHECK(fs.read(reinterpret_cast(&header), sizeof(header))) << "Fail to read parameters in pserver"; - CHECK_EQ(header.version, Parameter::kFormatVersion) - << "Incorrect format version: " << header.version; + CHECK(Parameter::isHeaderFormatSupported(header.format)) + << "Incorrect format version: " << header.format; CHECK_EQ(header.size, (size_t)size_) << "The size (" << header.size << ") in the file does not match the size " << "(" << size_ << ") of the pserver: " << serverId_; @@ -1063,7 +1063,8 @@ void ParameterServer2::saveValueVector(const SaveValueRequest& request, CpuVector& vec = 
vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY] : *vectors_[PARAMETER_VALUE]; Parameter::Header header; - header.version = Parameter::kFormatVersion; + // TODO(TJ): save param headerFormat_ + header.format = PARAM_FORMAT_ORIGINAL; header.valueSize = sizeof(real); header.size = size_; diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/pserver/test/CMakeLists.txt index 6e8f9c37f6..b66a00ba06 100644 --- a/paddle/pserver/test/CMakeLists.txt +++ b/paddle/pserver/test/CMakeLists.txt @@ -3,7 +3,7 @@ add_unittest_without_exec(socket_test SocketTest.cpp) add_test(NAME socket_test - COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/socket_test --loop_time=10) ####################### test_ProtoServer #################### @@ -12,7 +12,7 @@ add_unittest_without_exec(test_ProtoServer IF(NOT ON_TRAVIS) add_test(NAME test_ProtoServer - COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer) ENDIF(NOT ON_TRAVIS) @@ -24,5 +24,5 @@ ENDIF(NOT ON_TRAVIS) add_unittest_without_exec(test_ParameterServer2 test_ParameterServer2.cpp) add_test(NAME test_ParameterServer2 - COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port -n 4 + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 4 ${CMAKE_CURRENT_BINARY_DIR}/test_ParameterServer2) diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt deleted file mode 100644 index af85fdeecb..0000000000 --- a/paddle/pybind/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc deleted file mode 100644 index f9f87acf15..0000000000 --- a/paddle/pybind/pybind.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -namespace py = pybind11; -namespace pd = paddle::framework; - -PYBIND11_PLUGIN(core) { - py::module m("core", "C++ core of Paddle Paddle"); - - py::class_(m, "Variable", R"DOC(Variable Class. - -All parameter, weight, gradient are variables in Paddle. 
-)DOC") - .def("is_int", [](const pd::Variable& var) { return var.IsType(); }) - .def("set_int", - [](pd::Variable& var, int val) -> void { - *var.GetMutable() = val; - }) - .def("get_int", - [](const pd::Variable& var) -> int { return var.Get(); }); - - py::class_>(m, "Scope") - .def(py::init&>()) - .def("get_var", - &pd::Scope::GetVariable, - py::return_value_policy::reference) - .def("create_var", - &pd::Scope::CreateVariable, - py::return_value_policy::reference); - - return m.ptr(); -} diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt index 66a46e1883..a52f06fe49 100644 --- a/paddle/scripts/CMakeLists.txt +++ b/paddle/scripts/CMakeLists.txt @@ -1,17 +1,15 @@ configure_file(submit_local.sh.in - submit_local.sh + paddle @ONLY) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ - GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ - RENAME paddle) + GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) configure_file(tools/usage_stat/usage.sh - usage.sh + paddle_usage @ONLY) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/usage.sh DESTINATION opt/paddle/bin +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle_usage DESTINATION opt/paddle/bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ - GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ - RENAME paddle_usage) + GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index ab60f1a38d..2941662f34 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -31,54 +31,58 @@ Configuring cmake in /paddle/build ... -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU:-OFF} -DWITH_AVX=${WITH_AVX:-OFF} - -DWITH_GOLANG=${WITH_GOLANG:-OFF} + -DWITH_GOLANG=${WITH_GOLANG:-ON} -DWITH_SWIG_PY=ON + -DWITH_C_API=${WITH_C_API:-OFF} + -DWITH_PYTHON=${WITH_PYTHON:-ON} + -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} -DWITH_TESTING=${WITH_TESTING:-OFF} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ======================================== EOF + +# Disable UNITTEST_USE_VIRTUALENV in docker because +# docker environment is fully controlled by this script. +# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option. cmake .. \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_DOC=OFF \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \ - -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ - -DWITH_SWIG_PY=ON \ + -DWITH_GOLANG=${WITH_GOLANG:-ON} \ + -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \ + -DWITH_C_API=${WITH_C_API:-OFF} \ + -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \ -DWITH_TESTING=${WITH_TESTING:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON cat <> /paddle/build/Dockerfile </dev/null || true +mkdir -p /paddle/build_android +cd /paddle/build_android +rm -rf /paddle/install 2>/dev/null || true cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=armeabi-v7a \ @@ -20,7 +20,4 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ -DWITH_SWIG_PY=OFF \ .. 
make -j `nproc` -make install - -export PATH=/paddle/install/bin:/paddle/install/opt/paddle/bin:$PATH -paddle version +make install -j `nproc` diff --git a/paddle/scripts/run_python_tests.sh b/paddle/scripts/run_python_tests.sh deleted file mode 100755 index 1ed497aaec..0000000000 --- a/paddle/scripts/run_python_tests.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -pushd `dirname $0` > /dev/null -SCRIPTPATH=$PWD -popd > /dev/null - -USE_VIRTUALENV_FOR_TEST=$1; shift -PYTHON=$1; shift - -if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then - rm -rf .test_env - virtualenv .test_env - unset PYTHONHOME - unset PYTHONPATH - source .test_env/bin/activate - PYTHON=python -fi - -$PYTHON -m pip install $SCRIPTPATH/../dist/*.whl - -if [ "X${PADDLE_PACKAGE_DIR}" != "X" ]; then - $PYTHON -m pip install ${PADDLE_PACKAGE_DIR}/*.whl -else - export PYTHONPATH=$SCRIPTPATH/../../python/ -fi - -$PYTHON -m pip install ipython==5.3 - -for fn in "$@" -do - echo "test $fn" - $PYTHON $fn - if [ $? -ne 0 ]; then - exit 1 - fi -done - -if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then - deactivate - rm -rf .test_env -fi diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in old mode 100644 new mode 100755 index 12bf629ea9..26f9c0fcd4 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -18,6 +18,8 @@ function version(){ echo "PaddlePaddle @PADDLE_VERSION@, compiled with" echo " with_avx: @WITH_AVX@" echo " with_gpu: @WITH_GPU@" + echo " with_mkldnn: @WITH_MKLDNN" + echo " with_mklml: @WITH_MKLML@" echo " with_double: @WITH_DOUBLE@" echo " with_python: @WITH_PYTHON@" echo " with_rdma: @WITH_RDMA@" @@ -54,8 +56,7 @@ if [ -z "${PADDLE_NO_STAT+x}" ]; then fi fi - -MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +PADDLE_BIN_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" if [ ! -z "${DEBUGGER}" ]; then echo "Using debug command ${DEBUGGER}" @@ -91,34 +92,16 @@ else: sys.exit(0) EOF -if [ $? -eq 1 ]; then # Older version installed, or not installed at all - echo "First time run paddle, need to install some python dependencies." - # setuptools normalizes package version, so we need to use normalized - # package version for paddle python package - PYTHON_PADDLE_VERSION=$(python -c 'import packaging.version -import setuptools -print str(packaging.version.Version("@PADDLE_VERSION@")) -' 2>/dev/null) - BASEDIR=$(dirname "$0") - pip install ${BASEDIR}/../opt/paddle/share/wheels/*-${PYTHON_PADDLE_VERSION}-*.whl - if [ $? -ne 0 ]; then - echo "pip install wheels failed. " - echo "Please use 'sudo paddle' at the first time you use PaddlePaddle" - echo "PaddlePaddle will install some python dependencies automatically." - exit 1 - fi - echo "Python dependencies are installed." 
-fi case "$1" in "train") - ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_trainer ${@:2} + ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2} ;; "merge_model") - ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_merge_model ${@:2} + ${DEBUGGER} $PADDLE_BIN_PATH/paddle_merge_model ${@:2} ;; "pserver") - ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_pserver_main ${@:2} + ${DEBUGGER} $PADDLE_BIN_PATH/paddle_pserver_main ${@:2} ;; "dump_config") python -m paddle.utils.dump_config ${@:2} @@ -127,7 +110,7 @@ case "$1" in python -m paddle.utils.make_model_diagram ${@:2} ;; "usage") - $MYDIR/../opt/paddle/bin/paddle_usage ${@:2} + $PADDLE_BIN_PATH/paddle_usage ${@:2} ;; "version") version diff --git a/paddle/scripts/travis/build_android.sh b/paddle/scripts/travis/build_android.sh new file mode 100755 index 0000000000..004067a8f5 --- /dev/null +++ b/paddle/scripts/travis/build_android.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc +TMP_DIR=$HOME/$JOB/tmp +mkdir -p $TMP_DIR +cd $TMP_DIR +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh +$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN +cd $HOME +rm -rf $TMP_DIR + +# Create the build directory for CMake. +mkdir -p $TRAVIS_BUILD_DIR/build_android +cd $TRAVIS_BUILD_DIR/build_android + +# Compile paddle binaries +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_STYLE_CHECK=OFF \ + .. + +make -j `nproc` diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index a443851580..dfcff38302 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -5,15 +5,9 @@ set -e mkdir -p $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build -# Compile paddle binaries first -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF - -mkdir output -make -j `nproc` -find .. -name '*whl' | xargs pip install # install all wheels. -rm -rf * # Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON +make -j `nproc` gen_proto_py make -j `nproc` paddle_docs paddle_docs_cn # check websites for broken links @@ -35,6 +29,7 @@ TARGET_BRANCH="gh-pages" SOURCE_BRANCH="master" # Clone the repo to output directory +mkdir output git clone $REPO output cd output diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh index 4754bdd4c8..ec499a839a 100755 --- a/paddle/scripts/travis/check_style.sh +++ b/paddle/scripts/travis/check_style.sh @@ -1,7 +1,7 @@ #!/bin/bash function abort(){ echo "Your change doesn't follow PaddlePaddle's code style." 1>&2 - echo "Please use pre-commit to reformat your code and git push again." 1>&2 + echo "Please use pre-commit to check what is wrong." 
1>&2 exit 1 } @@ -13,8 +13,14 @@ export PATH=/usr/bin:$PATH pre-commit install clang-format --version +# set up go environment for running gometalinter +mkdir -p $GOPATH/src/github.com/PaddlePaddle/ +ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle +cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd - + if ! pre-commit run -a ; then - git diff --exit-code + git diff + exit 1 fi trap : 0 diff --git a/paddle/setup.py.in b/paddle/setup.py.in deleted file mode 100644 index 06d55d3abc..0000000000 --- a/paddle/setup.py.in +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from setuptools import setup, Extension - -setup(name="py_paddle", - version="${PADDLE_VERSION}", - packages=['py_paddle'], - include_package_data=True, - package_data={'py_paddle':['*.py','_swig_paddle.so']}, - install_requires = [ - 'nltk>=3.2.2', - 'numpy>=1.8.0', # The numpy is required. - 'protobuf==${PROTOBUF_VERSION}' # The paddle protobuf version - ], - url='http://www.paddlepaddle.org/', - license='Apache 2.0', -) diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt index 5becf62672..60667b7287 100644 --- a/paddle/string/CMakeLists.txt +++ b/paddle/string/CMakeLists.txt @@ -2,3 +2,4 @@ cc_library(stringpiece SRCS piece.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) +cc_test(to_string_test SRCS to_string_test.cc) diff --git a/paddle/string/piece.h b/paddle/string/piece.h index db7c3e6980..03ae9243a4 100644 --- a/paddle/string/piece.h +++ b/paddle/string/piece.h @@ -35,12 +35,12 @@ public: // We provide non-explicit singleton constructors so users can // pass in a "const char*" or a "string" wherever a "Piece" - // is expected. These contructors ensure that if data_ is NULL, + // is expected. These constructors ensure that if data_ is NULL, // size_ is 0. Piece(); Piece(const char* d, size_t n); - Piece(const char* d); - Piece(const std::string& s); + Piece(const char* d); // NOLINT: accept C string into Piece. + Piece(const std::string& s); // NOLINT: accept C++ string into Piece. const char* data() const { return data_; } size_t len() const { return size_; } diff --git a/paddle/string/to_string.h b/paddle/string/to_string.h new file mode 100644 index 0000000000..4f478b6a36 --- /dev/null +++ b/paddle/string/to_string.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
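
Editorial note on the piece.h hunk above: it only annotates the single-argument constructors with NOLINT; they deliberately stay implicit so that a Piece parameter accepts either a C string or a std::string without an explicit conversion. A minimal sketch of that calling pattern, assuming Piece sits in the paddle::string namespace like the new to_string.h (pieceLen is a hypothetical helper, not part of this diff):

#include <string>
#include "paddle/string/piece.h"

// Takes Piece by value; both calls below rely on the implicit
// converting constructors kept (and NOLINT-ed) in piece.h.
static size_t pieceLen(paddle::string::Piece p) { return p.len(); }

void pieceDemo() {
  std::string owned = "hello";
  pieceLen("hello");  // const char*  -> Piece
  pieceLen(owned);    // std::string  -> Piece
}
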
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace string { +template +inline std::string to_string(T v) { + std::ostringstream sout; + sout << v; + return sout.str(); +} + +// Faster std::string/const char* type +template <> +inline std::string to_string(std::string v) { + return v; +} + +template <> +inline std::string to_string(const char* v) { + return std::string(v); +} + +} // namespace string +} // namespace paddle diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc new file mode 100644 index 0000000000..5ff1b007f1 --- /dev/null +++ b/paddle/string/to_string_test.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/string/to_string.h" +#include + +constexpr char kOutputString[] = "User Defined Output"; +class UserDefinedClass { +public: +}; + +std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) { + s << kOutputString; + return s; +} + +TEST(to_string, normal) { + using namespace paddle::string; + ASSERT_EQ("10", to_string(10)); + ASSERT_EQ("abc", to_string("abc")); + ASSERT_EQ("1.2", to_string(1.2)); +} + +TEST(to_string, user_defined) { + using namespace paddle::string; + UserDefinedClass instance; + ASSERT_EQ(kOutputString, to_string(instance)); +} \ No newline at end of file diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp index b359d9da21..35dcb235e7 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -28,6 +28,17 @@ NewRemoteParameterUpdater::NewRemoteParameterUpdater( newGradients_(nullptr), pserverSpec_(pserverSpec) {} +NewRemoteParameterUpdater::NewRemoteParameterUpdater( + const OptimizationConfig &config, + const std::string pserverSpec, + const bool useEtcd) + : trainerConfig_(config), + parameterClient_(-1), + newParameters_(nullptr), + newGradients_(nullptr), + pserverSpec_(pserverSpec), + useEtcd_(useEtcd) {} + void NewRemoteParameterUpdater::init( const std::vector ¶meters) { ParameterUpdater::init(parameters); @@ -38,8 +49,13 @@ void NewRemoteParameterUpdater::init( } // create parameter server client. - parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), - FLAGS_trainer_id == 0); + if (useEtcd_) { + parameterClient_ = + paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str()); + } else { + parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), + FLAGS_trainer_id == 0); + } // init new parameter and gradient. newParameters_ = initNewParameter(PARAMETER_VALUE); @@ -50,24 +66,92 @@ void NewRemoteParameterUpdater::init( // from parameter server if (paddle_begin_init_params(parameterClient_)) { LOG(INFO) << "paddle_begin_init_params start"; + // NOTE: convert V1 OptimizatioinConfig proto to V2 OptimizerConfig. 
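
Editorial note on the NewRemoteParameterUpdater hunks above: they add a constructor overload carrying a useEtcd flag and pick the client accordingly, paddle_new_etcd_pserver_client when the spec is an etcd endpoint, otherwise the plain "host:port,host:port" pserver client gated on trainer id 0. A minimal, hypothetical call-site sketch, assuming the caller already holds a parsed OptimizationConfig (makeRemoteUpdater and its wiring are illustrative only and not part of this diff):

#include <memory>
#include <string>
#include "paddle/trainer/NewRemoteParameterUpdater.h"

// pserverSpec is an etcd endpoint when useEtcd is true, otherwise a
// comma-separated "host:port" pserver address list.
std::unique_ptr<paddle::NewRemoteParameterUpdater> makeRemoteUpdater(
    const paddle::OptimizationConfig& optConfig,
    const std::string& pserverSpec,
    bool useEtcd) {
  return std::unique_ptr<paddle::NewRemoteParameterUpdater>(
      new paddle::NewRemoteParameterUpdater(optConfig, pserverSpec, useEtcd));
}
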
+ // This makes golang pserver compatible with handy V1 demos. + // TODO(wuyi): Refine or remove these ugly converting lines + OptimizerConfig optimizerConfigV2; + if (trainerConfig_.learning_method() == "momentum") { + optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); + } else if (trainerConfig_.learning_method() == "adagrad") { + optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad); + optimizerConfigV2.mutable_adagrad()->set_epsilon( + trainerConfig_.ada_epsilon()); + } else if (trainerConfig_.learning_method() == "adadelta") { + optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad); + optimizerConfigV2.mutable_adadelta()->set_epsilon( + trainerConfig_.ada_epsilon()); + optimizerConfigV2.mutable_adadelta()->set_rho(trainerConfig_.ada_rou()); + } else if (trainerConfig_.learning_method() == "adam") { + optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adam); + optimizerConfigV2.mutable_adam()->set_beta_1(trainerConfig_.adam_beta1()); + optimizerConfigV2.mutable_adam()->set_beta_2(trainerConfig_.adam_beta2()); + optimizerConfigV2.mutable_adam()->set_epsilon( + trainerConfig_.adam_epsilon()); + } else { + LOG(ERROR) << "got unsupported v1 optimizer config: " + << trainerConfig_.learning_method(); + optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); + } + + if (trainerConfig_.learning_rate_schedule() == "constant") { + optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); + optimizerConfigV2.mutable_const_lr()->set_learning_rate( + trainerConfig_.learning_rate()); + } else if (trainerConfig_.learning_rate_schedule() == "linear") { + optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Linear); + optimizerConfigV2.mutable_linear_lr()->set_learning_rate( + trainerConfig_.learning_rate()); + optimizerConfigV2.mutable_linear_lr()->set_lr_decay_a( + trainerConfig_.learning_rate_decay_a()); + optimizerConfigV2.mutable_linear_lr()->set_lr_decay_b( + trainerConfig_.learning_rate_decay_b()); + } else { + LOG(ERROR) << "got unsupported v1 learning_rate_schedule config: " + << trainerConfig_.learning_rate_schedule() << ", set to const"; + optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); + } + + // overwrite optimizerConfigV2 for per-parameter(layer) configs for (int i = 0; i < parameterSize(); ++i) { auto paramConfig = parameters_[i]->getConfig(); - LOG(INFO) << "old param config: " << paramConfig.DebugString(); - // FIXME(typhoonzero): convert old paramConfig to optimizerConfig - OptimizerConfig optimizeConfigV2; - auto sgdConfigV2 = optimizeConfigV2.mutable_sgd(); - sgdConfigV2->set_momentum(paramConfig.momentum()); - sgdConfigV2->set_decay(paramConfig.decay_rate()); - optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); - auto constlr = optimizeConfigV2.mutable_const_lr(); - constlr->set_learning_rate(paramConfig.learning_rate()); - if (trainerConfig_.algorithm() == "sgd") { - optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); - // FIXME: config all algorithms - } else { - optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); + if (paramConfig.has_momentum() && + trainerConfig_.learning_method() == "momentum") { + optimizerConfigV2.mutable_sgd()->set_momentum(paramConfig.momentum()); + } + if (paramConfig.has_learning_rate()) { + switch (optimizerConfigV2.lr_policy()) { + case 0: + optimizerConfigV2.mutable_const_lr()->set_learning_rate( + paramConfig.learning_rate()); + break; + case 1: + optimizerConfigV2.mutable_linear_lr()->set_learning_rate( + paramConfig.learning_rate()); + 
break; + } + } + if (paramConfig.has_decay_rate()) { + switch (optimizerConfigV2.optimizer()) { + case 1: // SGD + optimizerConfigV2.mutable_sgd()->set_decay( + paramConfig.decay_rate()); + break; + case 2: // Adadelta + optimizerConfigV2.mutable_adadelta()->set_decay( + paramConfig.decay_rate()); + break; + case 3: // Adagrad + optimizerConfigV2.mutable_adagrad()->set_decay( + paramConfig.decay_rate()); + break; + case 4: // Adam + optimizerConfigV2.mutable_adam()->set_decay( + paramConfig.decay_rate()); + break; + } } - std::string bytes = optimizeConfigV2.SerializeAsString(); + // send param and config to pserver + std::string bytes = optimizerConfigV2.SerializeAsString(); const char *array = bytes.data(); int size = (int)bytes.size(); paddle_init_param( diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h index dfed00bc21..6223ba427c 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.h +++ b/paddle/trainer/NewRemoteParameterUpdater.h @@ -32,6 +32,9 @@ class NewRemoteParameterUpdater : public ParameterUpdater { public: NewRemoteParameterUpdater(const OptimizationConfig& config, const std::string pserverSpec); + NewRemoteParameterUpdater(const OptimizationConfig& config, + const std::string pserverSpec, + const bool useEtcd); ~NewRemoteParameterUpdater() { releaseNewParameter(newParameters_); releaseNewParameter(newGradients_); @@ -111,6 +114,8 @@ protected: paddle_parameter** newGradients_; /// the specification of parameter server "host1:port,host1:port" std::string pserverSpec_; + /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr + bool useEtcd_; }; } // namespace paddle diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp index 60ac8459a1..a0a365aa0b 100644 --- a/paddle/trainer/TrainerConfigHelper.cpp +++ b/paddle/trainer/TrainerConfigHelper.cpp @@ -28,6 +28,7 @@ DECLARE_bool(with_cost); DECLARE_bool(with_gpu); DECLARE_bool(parallel_nn); DECLARE_string(config_args); +DECLARE_bool(use_mkldnn); const char *kConfigParserModuleName = "paddle.trainer.config_parser"; const char *kConfigParserFuncName = "parse_config_and_serialize"; @@ -44,6 +45,7 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath) configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu << ",parallel_nn=" << FLAGS_parallel_nn + << ",use_mkldnn=" << FLAGS_use_mkldnn << ",cudnn_version=" << hl_get_cudnn_lib_version(); if (!FLAGS_config_args.empty()) { configArgs << "," << FLAGS_config_args; @@ -62,11 +64,7 @@ TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config) m->conf = config; } -TrainerConfigHelper::~TrainerConfigHelper() { - if (m) { - delete m; - } -} +TrainerConfigHelper::~TrainerConfigHelper() { delete m; } const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; } diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt index 08b2d8a38e..f01ad4142d 100644 --- a/paddle/trainer/tests/CMakeLists.txt +++ b/paddle/trainer/tests/CMakeLists.txt @@ -2,19 +2,19 @@ add_unittest_without_exec(test_Compare test_Compare.cpp) add_test(NAME test_Compare - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_Compare - WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) + WORKING_DIRECTORY 
${PADDLE_SOURCE_DIR}/paddle/) ################# test_Trainer ########################### add_unittest_without_exec(test_Trainer test_Trainer.cpp) add_test(NAME test_Trainer - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ - ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/gen_proto_data.py && - ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ + ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/gen_proto_data.py && + ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer - WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) ############### test_TrainerOnePass ########################## if(WITH_PYTHON) @@ -23,60 +23,60 @@ if(WITH_PYTHON) add_unittest_without_exec(test_TrainerOnePass test_TrainerOnePass.cpp) add_test(NAME test_TrainerOnePass - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d - ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests - ${PROJ_ROOT}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass - WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests + ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) endif() ################ test_CompareTwoNets ###################### add_unittest_without_exec(test_CompareTwoNets test_CompareTwoNets.cpp) add_test(NAME test_CompareTwoNets - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf - WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) ############### test_CompareTwoOpts ################### add_unittest_without_exec(test_CompareTwoOpts test_CompareTwoOpts.cpp) add_test(NAME test_CompareTwoOpts - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf --num_passes=1 --need_high_accuracy=0 - WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) ################# test_CompareSparse ################## add_unittest_without_exec(test_CompareSparse test_CompareSparse.cpp) if(NOT ON_TRAVIS) add_test(NAME test_CompareSparse - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ ./.set_port.sh -p port -n 6 ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse - WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) endif() ################# test_recurrent_machine_generation ############### add_unittest_without_exec(test_recurrent_machine_generation test_recurrent_machine_generation.cpp) add_test(NAME test_recurrent_machine_generation - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d 
${PROJ_ROOT}/python/ + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_recurrent_machine_generation - WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) #################### test_PyDataProviderWrapper ######################### add_unittest_without_exec(test_PyDataProviderWrapper test_PyDataProviderWrapper.cpp) add_test(NAME test_PyDataProviderWrapper - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d - ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProviderWrapper - WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) #################### test_config_parser ######################### add_test(NAME test_config_parser - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ - ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py - WORKING_DIRECTORY ${PROJ_ROOT}/paddle/) + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ + ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) diff --git a/paddle/trainer/tests/compare_sparse_data b/paddle/trainer/tests/compare_sparse_data new file mode 100644 index 0000000000..18fc654138 Binary files /dev/null and b/paddle/trainer/tests/compare_sparse_data differ diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data similarity index 100% rename from paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto rename to paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist index 8b041cd664..6b406dff0b 100644 --- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist +++ b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist @@ -1 +1 @@ -./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto +./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data diff --git a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf new file mode 100644 index 0000000000..92f32a18c0 --- /dev/null +++ b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf @@ -0,0 +1,154 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. + +# Note: when making change to this file, please make sure +# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest +# for comparing these two nets can pass (test_CompareTwoNets) + +default_initial_std(0.1) +default_device(0) + +word_dim = 999 +l1 = 0 +l2 = 0 + +model_type("nn") + +sparse_update = get_config_arg("sparse_update", bool, False) + +TrainData(ProtoData( + type = "proto_sequence", + files = ('trainer/tests/train_sparse.list'), + )) + +Settings( + algorithm='sgd', + batch_size=100, + learning_rate=0.0001, + learning_rate_decay_a=4e-08, + learning_rate_decay_b=0.0, + learning_rate_schedule='poly', +) + + +wordvec_dim = 32 +layer2_dim = 16 +layer3_dim = 16 +hidden_dim = 32 + +slot_names = ["qb", "qw", "tb", "tw"] + +def ltr_network(network_name, + word_dim=word_dim, + wordvec_dim=wordvec_dim, + layer2_dim=layer2_dim, + layer3_dim=layer3_dim, + hidden_dim=hidden_dim, + slot_names=slot_names, + l1=l1, + l2=l2): + + slotnum = len(slot_names) + for i in xrange(slotnum): + Inputs(slot_names[i] + network_name) + for i in xrange(slotnum): + Layer( + name = slot_names[i] + network_name, + type = "data", + size = word_dim, + device = -1, + ) + Layer( + name = slot_names[i] + "_embedding_" + network_name, + type = "mixed", + size = wordvec_dim, + bias = False, + device = -1, + inputs = TableProjection(slot_names[i] + network_name, + parameter_name = "embedding.w0", + decay_rate_l1=l1, + sparse_remote_update = True, + sparse_update = sparse_update, + ), + ) + Layer( + name = slot_names[i] + "_rnn1_" + network_name, + type = "recurrent", + active_type = "tanh", + bias = Bias(initial_std = 0, + parameter_name = "rnn1.bias"), + inputs = Input(slot_names[i] + "_embedding_" + network_name, + parameter_name = "rnn1.w0") + ) + Layer( + name = slot_names[i] + "_rnnlast_" + network_name, + type = "seqlastins", + inputs = [ + slot_names[i] + "_rnn1_" + network_name, + ], + ) + + Layer( + name = "layer2_" + network_name, + type = "fc", + active_type = "tanh", + size = layer2_dim, + bias = Bias(parameter_name = "layer2.bias"), + inputs = [Input(slot_name + "_rnnlast_" + network_name, + parameter_name = "_layer2_" + slot_name + ".w", + decay_rate = l2, + initial_smart = True) for slot_name in slot_names] + ) + Layer( + name = "layer3_" + network_name, + type = "fc", + active_type = "tanh", + size = layer3_dim, + bias = Bias(parameter_name = "layer3.bias"), + inputs = [ + Input("layer2_" + network_name, + parameter_name = "_layer3.w", + decay_rate = l2, + initial_smart = True), + ] + ) + Layer( + name = "output_" + network_name, + type = "fc", + size = 1, + bias = False, + inputs = [ + Input("layer3_" + network_name, + parameter_name = "_layerO.w"), + ], + ) + + +ltr_network("left") +ltr_network("right") +Inputs("label") +Layer( + name = "label", + type = "data", + size = 1, + ) +Outputs("cost", "qb_rnnlast_left") +Layer( + name = "cost", + type = "rank-cost", + inputs = ["output_left", "output_right", "label"], + ) diff --git a/paddle/trainer/tests/simple_sparse_neural_network.py b/paddle/trainer/tests/simple_sparse_neural_network.py index 9604e1b9b4..30346ef299 100644 --- a/paddle/trainer/tests/simple_sparse_neural_network.py +++ b/paddle/trainer/tests/simple_sparse_neural_network.py @@ -1,6 +1,6 @@ from paddle.trainer_config_helpers import * -settings(batch_size=128, learning_method=AdaGradOptimizer(), learning_rate=1e-4) +settings(batch_size=17, 
learning_method=AdaGradOptimizer(), learning_rate=1e-4) file_list = 'trainer/tests/fake_file_list.list' @@ -12,7 +12,7 @@ define_py_data_sources2( embedding = embedding_layer( input=data_layer( - name="word_ids", size=65536), + name="word_ids", size=8191), size=128, param_attr=ParamAttr(sparse_update=True)) prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation()) diff --git a/paddle/trainer/tests/simple_sparse_neural_network_dp.py b/paddle/trainer/tests/simple_sparse_neural_network_dp.py index 8bfd1f37e7..86b272edfe 100644 --- a/paddle/trainer/tests/simple_sparse_neural_network_dp.py +++ b/paddle/trainer/tests/simple_sparse_neural_network_dp.py @@ -7,15 +7,15 @@ def init_hook(settings, is_train, **kwargs): @provider( - input_types={'word_ids': integer_value(65536), + input_types={'word_ids': integer_value(8191), 'label': integer_value(10)}, min_pool_size=0, init_hook=init_hook) def process(settings, filename): if settings.is_train: - data_size = 2**20 - else: data_size = 2**10 + else: + data_size = 2**5 for _ in xrange(data_size): - yield random.randint(0, 65535), random.randint(0, 9) + yield random.randint(0, 8190), random.randint(0, 9) diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp index a7000eb77e..813275518e 100644 --- a/paddle/trainer/tests/test_CompareSparse.cpp +++ b/paddle/trainer/tests/test_CompareSparse.cpp @@ -23,7 +23,7 @@ using namespace paddle; // NOLINT using namespace std; // NOLINT static const string& configFile1 = - "trainer/tests/sample_trainer_config_qb_rnn.conf"; + "trainer/tests/sample_trainer_config_compare_sparse.conf"; DECLARE_bool(use_gpu); DECLARE_string(config); diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp index 4d0174f784..00ba61377a 100644 --- a/paddle/trainer/tests/test_TrainerOnePass.cpp +++ b/paddle/trainer/tests/test_TrainerOnePass.cpp @@ -100,25 +100,25 @@ TEST(average_window, gpu) { } TEST(average_window, gpu2) { - FLAGS_num_passes = 100; + FLAGS_num_passes = 20; trainerOnePassTest(configFile1, true, false, 2, 0.01); FLAGS_num_passes = 1; } TEST(average_window, gpu4) { - FLAGS_num_passes = 100; + FLAGS_num_passes = 20; trainerOnePassTest(configFile1, true, false, 4, 0.01); FLAGS_num_passes = 1; } TEST(average_window_cpu, gpu2) { - FLAGS_num_passes = 100; + FLAGS_num_passes = 20; trainerOnePassTest(configFile1, true, false, 2, 0.01, true); FLAGS_num_passes = 1; } TEST(average_window_cpu, gpu4) { - FLAGS_num_passes = 100; + FLAGS_num_passes = 20; trainerOnePassTest(configFile1, true, false, 4, 0.01, true); FLAGS_num_passes = 1; } diff --git a/paddle/trainer/tests/train_sparse.list b/paddle/trainer/tests/train_sparse.list new file mode 100644 index 0000000000..6ea020e220 --- /dev/null +++ b/paddle/trainer/tests/train_sparse.list @@ -0,0 +1 @@ +trainer/tests/compare_sparse_data diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h index 9b5ad21724..2e5ff76a06 100644 --- a/paddle/utils/DynamicLoader.h +++ b/paddle/utils/DynamicLoader.h @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifndef DYNAMIC_LOAD_H_ -#define DYNAMIC_LOAD_H_ +#pragma once #include #include @@ -59,5 +58,3 @@ void GetWarpCTCDsoHandle(void** dso_handle); * */ void GetLapackDsoHandle(void** dso_handle); - -#endif // DYNAMIC_LOAD_H_ diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h index 27ddaab3f0..7cde983060 100644 --- a/paddle/utils/Error.h +++ b/paddle/utils/Error.h @@ -126,9 +126,11 @@ public: } /** - * @brief operator bool, return True if there is something error. + * @brief check this status by glog. + * @note It is a temp method used during cleaning Paddle code. It will be + * removed later. */ - operator bool() const { return !this->isOK(); } + void check() const { CHECK(this->isOK()) << msg(); } /** * @brief isOK return True if there is no error. @@ -136,13 +138,6 @@ public: */ bool isOK() const { return msg_ == nullptr; } - /** - * @brief check this status by glog. - * @note It is a temp method used during cleaning Paddle code. It will be - * removed later. - */ - void check() const { CHECK(this->isOK()) << msg(); } - private: std::shared_ptr msg_; }; diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp index 320f671ed9..ab1c181c62 100644 --- a/paddle/utils/Flags.cpp +++ b/paddle/utils/Flags.cpp @@ -20,6 +20,13 @@ DEFINE_bool(use_gpu, false, "Only support CPU training"); DEFINE_bool(use_gpu, true, "Whether to use GPU for training"); #endif +#ifdef PADDLE_USE_MKLDNN +// TODO(TJ): change to true when MKLDNN layers support multi-inputs +DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training"); +#else +DEFINE_bool(use_mkldnn, false, "Only support CPU training"); +#endif + DEFINE_bool(parallel_nn, false, "Whether to use multi-threads to calculate one neural network." diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h index dc4faef833..1832bb515e 100644 --- a/paddle/utils/Flags.h +++ b/paddle/utils/Flags.h @@ -40,3 +40,4 @@ DECLARE_bool(show_layer_stat); DECLARE_string(predict_file); DECLARE_bool(prev_batch_state); DECLARE_string(init_model_path); +DECLARE_bool(use_mkldnn); diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h index b5e2862546..0a27b8b97b 100644 --- a/paddle/utils/ThreadLocal.h +++ b/paddle/utils/ThreadLocal.h @@ -51,7 +51,7 @@ template class ThreadLocal { public: ThreadLocal() { - CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0); + CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0); } ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); } @@ -65,7 +65,7 @@ public: if (!p && createLocal) { p = new T(); int ret = pthread_setspecific(threadSpecificKey_, p); - CHECK(ret == 0); + CHECK_EQ(ret, 0); } return p; } @@ -79,7 +79,7 @@ public: if (T* q = get(false)) { dataDestructor(q); } - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); } /** @@ -112,7 +112,7 @@ private: template class ThreadLocalD { public: - ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); } + ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); } ~ThreadLocalD() { pthread_key_delete(threadSpecificKey_); for (auto t : threadMap_) { @@ -127,7 +127,7 @@ public: T* p = (T*)pthread_getspecific(threadSpecificKey_); if (!p) { p = new T(); - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); updateMap(p); } return p; @@ -141,7 +141,7 @@ public: if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) { dataDestructor(q); } - 
CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); updateMap(p); } diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt index aa923b3553..c770ce1698 100644 --- a/paddle/utils/tests/CMakeLists.txt +++ b/paddle/utils/tests/CMakeLists.txt @@ -13,6 +13,6 @@ add_executable( link_paddle_exe(test_CustomStackTracePrint) if(NOT APPLE) add_test(NAME test_CustomStackTracePrint - COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh + COMMAND ${PADDLE_SOURCE_DIR}/paddle/utils/tests/test_CustomStackTracePrint.sh WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp index fdf326b17a..6f311fa6b8 100644 --- a/paddle/utils/tests/test_Error.cpp +++ b/paddle/utils/tests/test_Error.cpp @@ -18,17 +18,17 @@ limitations under the License. */ TEST(Error, testAll) { paddle::Error error; - ASSERT_FALSE(error); + ASSERT_TRUE(error.isOK()); error = paddle::Error("I'm the error"); - ASSERT_TRUE(error); + ASSERT_FALSE(error.isOK()); ASSERT_STREQ("I'm the error", error.msg()); error = paddle::Error("error2"); - ASSERT_TRUE(error); + ASSERT_FALSE(error.isOK()); ASSERT_STREQ("error2", error.msg()); int i = 3; auto error3 = paddle::Error("error%d", i); - ASSERT_TRUE(error3); + ASSERT_FALSE(error3.isOK()); ASSERT_STREQ("error3", error3.msg()); } diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt index 18584cafe7..6212c2e60a 100644 --- a/proto/CMakeLists.txt +++ b/proto/CMakeLists.txt @@ -9,15 +9,15 @@ foreach(filename ${proto_filenames}) get_filename_component(ABS_FIL ${filename} ABSOLUTE) get_filename_component(FIL_WE ${filename} NAME_WE) set(CUR_PROTO_GEN_PY - ${PROJ_ROOT}/paddle/python/paddle/proto/${FIL_WE}_pb2.py) + ${PADDLE_SOURCE_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py) set(PROTO_GEN_PY ${CUR_PROTO_GEN_PY} ${PROTO_GEN_PY}) add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY} COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - ARGS "--python_out=${PROJ_ROOT}/python/paddle/proto" + ARGS "--python_out=${PADDLE_SOURCE_DIR}/python/paddle/proto" "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL} - DEPENDS ${ABS_FIL} ${external_project_dependencies}) + DEPENDS ${ABS_FIL} protoc) endforeach() add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY}) diff --git a/proto/DataConfig.proto b/proto/DataConfig.proto index e895c184d9..0cb5d7afbb 100644 --- a/proto/DataConfig.proto +++ b/proto/DataConfig.proto @@ -15,14 +15,13 @@ syntax = "proto2"; package paddle; - message FileGroupConf { - optional uint32 queue_capacity = 1 [default = 1]; + optional uint32 queue_capacity = 1 [ default = 1 ]; // how many files to load for a load file thread - optional int32 load_file_count = 2 [default = 1]; + optional int32 load_file_count = 2 [ default = 1 ]; // how many threads to load files // Setting to be 5~10 is appropriate when loading files by hadoop vfs - optional int32 load_thread_num = 3 [default = 1]; + optional int32 load_thread_num = 3 [ default = 1 ]; }; message DataConfig { @@ -32,26 +31,28 @@ message DataConfig { // name of a text file which contains a list of file names at each line optional string files = 3; - optional int32 feat_dim = 4;//feature dimension of one frame - repeated int32 slot_dims = 5;//feature slot dims - optional int32 context_len = 6;//max neibour frame numbers - optional uint64 buffer_capacity = 7;//the number of samples + optional int32 feat_dim = 4; // feature dimension of one frame + repeated int32 slot_dims = 5; // feature 
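
Editorial note on the Error.h hunk above: operator bool() is removed and check() is kept as the temporary glog-based assertion, so call sites test isOK() explicitly, as the updated test_Error.cpp does. A small sketch of the resulting calling pattern (loadModel is a hypothetical function, not part of this diff):

#include <string>
#include "paddle/utils/Error.h"

// Hypothetical function returning the new-style error object.
paddle::Error loadModel(const std::string& path) {
  if (path.empty()) return paddle::Error("model path is empty");
  return paddle::Error();  // default-constructed Error means OK
}

void errorDemo() {
  paddle::Error err = loadModel("");
  if (!err.isOK()) {  // formerly written as: if (err) { ... }
    // handle err.msg() here
  }
  loadModel("/tmp/model").check();  // CHECK-fails via glog when not OK
}
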
slot dims + optional int32 context_len = 6; // max neibour frame numbers + optional uint64 buffer_capacity = 7; // the number of samples - //part of data used in training - //if not -1, part of train data is used in training - optional int64 train_sample_num = 8 [default = -1]; + // part of data used in training + // if not -1, part of train data is used in training + optional int64 train_sample_num = 8 [ default = -1 ]; - //The number of documents processed once - optional int32 file_load_num = 9 [default = -1]; - optional bool async_load_data = 12 [default = false]; + // The number of documents processed once + optional int32 file_load_num = 9 [ default = -1 ]; + optional bool async_load_data = 12 [ default = false ]; /// Note the field number 10, 11 and 13 have been deprecated. - optional bool for_test = 14 [default = false]; // whether this data is for test + optional bool for_test = 14 + [ default = false ]; // whether this data is for test optional FileGroupConf file_group_conf = 15; repeated int32 float_slot_dims = 16; /// Note the field number 17, 18 and 19 have been deprecated. - // a list of values which will be used to create additional one dimensional float + // a list of values which will be used to create additional one dimensional + // float // values slots. These one dimensional slots can be used as the weight input // for cost layers. // Currently this is only supported by ProtoDataProvider. @@ -65,21 +66,21 @@ message DataConfig { // for MultiDataProvider repeated DataConfig sub_data_configs = 24; // sub dataproviders - /* - * the ratio of each sub dataproviders: - * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100, - * then each mini-batch is combined by 10 instance from A and 90 instances - * from B. - */ + /* + * the ratio of each sub dataproviders: + * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100, + * then each mini-batch is combined by 10 instance from A and 90 instances + * from B. + */ optional int32 data_ratio = 25; /* * if one of the sub dataproviders is running out of data, then * (1) it is "main data", then finish current pass. * (2) it is not "main data", then reset it, and try getNextBatch again. */ - optional bool is_main_data = 26 [default = true]; + optional bool is_main_data = 26 [ default = true ]; - // the usage ratio of instances. Setting to 1.0 means the use of all instances. - optional double usage_ratio = 27 [default = 1.0]; + // the usage ratio of instances. Setting to 1.0 means the use of all + // instances. + optional double usage_ratio = 27 [ default = 1.0 ]; }; - diff --git a/proto/DataFormat.proto b/proto/DataFormat.proto index 19b1499b02..7d963bc29f 100644 --- a/proto/DataFormat.proto +++ b/proto/DataFormat.proto @@ -17,27 +17,32 @@ package paddle; /* If values is not empty and ids is empty, this is a dense vector. - If values is not empty and ids is not empty, this is a sparse vector. The position of each value + If values is not empty and ids is not empty, this is a sparse vector. The + position of each value is specified by ids. - If values is empty and ids is not empty, this is a sparse vector whose non-zero values are 1. + If values is empty and ids is not empty, this is a sparse vector whose non-zero + values are 1. The position of each 1 is specified by ids. 
*/ message VectorSlot { - repeated float values = 1 [packed = true]; - repeated uint32 ids = 2 [packed = true]; + repeated float values = 1 [ packed = true ]; + repeated uint32 ids = 2 [ packed = true ]; /* For multidimensional data, for example "image width height depth" */ - repeated uint32 dims = 3 [packed = true]; - repeated string strs = 4; + repeated uint32 dims = 3 [ packed = true ]; + repeated string strs = 4; }; /* - SubseqSlot use to record whether VectorSlot or any other slot in future has subseq. - If not all VectorSlot have subseq, we only store the one who has subseq, and use *slot_id* to record it. - One vector_slots has one sequence, and it may have N subseq, thus the number of *lens* will be N too. + SubseqSlot use to record whether VectorSlot or any other slot in future has + subseq. + If not all VectorSlot have subseq, we only store the one who has subseq, and + use *slot_id* to record it. + One vector_slots has one sequence, and it may have N subseq, thus the number of + *lens* will be N too. */ message SubseqSlot { - required uint32 slot_id = 1; //the id of slot who has subseq - repeated uint32 lens = 2; // lengths of sub-sequence in the slot + required uint32 slot_id = 1; // the id of slot who has subseq + repeated uint32 lens = 2; // lengths of sub-sequence in the slot }; message SlotDef { @@ -45,13 +50,14 @@ message SlotDef { VECTOR_DENSE = 0; VECTOR_SPARSE_NON_VALUE = 1; VECTOR_SPARSE_VALUE = 2; - INDEX = 3; // This can be used as label, or word id, etc. + INDEX = 3; // This can be used as label, or word id, etc. VAR_MDIM_DENSE = 4; VAR_MDIM_INDEX = 5; STRING = 6; } required SlotType type = 1; - required uint32 dim = 2; // For INDEX slots, this means the maximal index plus 1. + required uint32 dim = + 2; // For INDEX slots, this means the maximal index plus 1. }; message DataHeader { @@ -60,11 +66,11 @@ message DataHeader { }; message DataSample { - optional bool is_beginning = 1 [default = true]; // is the beginning of a sequence + optional bool is_beginning = 1 + [ default = true ]; // is the beginning of a sequence repeated VectorSlot vector_slots = 2; - repeated uint32 id_slots = 3 [packed = true]; + repeated uint32 id_slots = 3 [ packed = true ]; /* use ids of VectorSlot */ repeated VectorSlot var_id_slots = 4; repeated SubseqSlot subseq_slots = 5; }; - diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 37cd16c798..4f3d5bf3f6 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -21,7 +21,6 @@ package paddle; * Various structs for the configuration of a neural network */ - message ExternalConfig { repeated string layer_names = 1; repeated string input_layer_names = 2; @@ -68,7 +67,7 @@ message ConvConfig { required uint32 img_size = 8; // caffe mode for output size coherence - required bool caffe_mode = 9 [default = true]; + required bool caffe_mode = 9 [ default = true ]; // if filter_size_y is set , this convolutional layer will use // filters of size filter_size * filter_size_y pixels. @@ -99,7 +98,7 @@ message PoolConfig { optional uint32 start = 4; // Defines the stride size between successive pooling squares. - required uint32 stride = 5 [default = 1]; + required uint32 stride = 5 [ default = 1 ]; // The size of output feature map. required uint32 output_x = 6; @@ -109,7 +108,7 @@ message PoolConfig { // padding = 4, instructs the net to implicitly // pad the images with a 4-pixel border of zeros. 
- optional uint32 padding = 8 [default = 0]; + optional uint32 padding = 8 [ default = 0 ]; // if not set, use size_x optional uint32 size_y = 9; @@ -194,8 +193,11 @@ message MaxOutConfig { required uint32 groups = 2; } -message RowConvConfig { - required uint32 context_length = 1; +message RowConvConfig { required uint32 context_length = 1; } + +message SliceConfig { + required uint32 start = 1; + required uint32 end = 2; } message ProjectionConfig { @@ -207,17 +209,21 @@ message ProjectionConfig { // For ShiftProjection optional int32 context_start = 5; optional int32 context_length = 6; - optional bool trainable_padding = 7 [default = false]; + optional bool trainable_padding = 7 [ default = false ]; // For convolution optional ConvConfig conv_conf = 8; optional int32 num_filters = 9; // For IdentityOffsetProjection - optional uint64 offset = 11 [default = 0]; + optional uint64 offset = 11 [ default = 0 ]; // For pool optional PoolConfig pool_conf = 12; + + // For slice + // Each slice output is the input[start, end) + repeated SliceConfig slices = 13; } message OperatorConfig { @@ -227,7 +233,7 @@ message OperatorConfig { required uint64 output_size = 4; // For DotMulOperator - optional double dotmul_scale = 5 [default = 1.0]; + optional double dotmul_scale = 5 [ default = 1.0 ]; // For ConvOperator optional ConvConfig conv_conf = 6; @@ -273,8 +279,8 @@ message MultiBoxLossConfig { required float neg_overlap = 4; required uint32 background_id = 5; required uint32 input_num = 6; - optional uint32 height = 7 [default = 1]; - optional uint32 width = 8 [default = 1]; + optional uint32 height = 7 [ default = 1 ]; + optional uint32 width = 8 [ default = 1 ]; } message DetectionOutputConfig { @@ -285,8 +291,13 @@ message DetectionOutputConfig { required uint32 input_num = 5; required uint32 keep_top_k = 6; required float confidence_threshold = 7; - optional uint32 height = 8 [default = 1]; - optional uint32 width = 9 [default = 1]; + optional uint32 height = 8 [ default = 1 ]; + optional uint32 width = 9 [ default = 1 ]; +} + +message ClipConfig { + required double min = 1; + required double max = 2; } message LayerInputConfig { @@ -309,6 +320,7 @@ message LayerInputConfig { optional RowConvConfig row_conv_conf = 15; optional MultiBoxLossConfig multibox_loss_conf = 16; optional DetectionOutputConfig detection_output_conf = 17; + optional ClipConfig clip_conf = 18; } message LayerConfig { @@ -316,7 +328,7 @@ message LayerConfig { required string name = 1; required string type = 2; optional uint64 size = 3; - //optional ActivationConfig activation = 4; + // optional ActivationConfig activation = 4; optional string active_type = 4; repeated LayerInputConfig inputs = 5; optional string bias_parameter_name = 6; @@ -329,7 +341,7 @@ message LayerConfig { // (which is how convnets are usually trained). Setting this to // false will untie the biases, yielding a separate bias for // every location at which the filter is applied. - optional bool shared_biases = 8 [default = false]; + optional bool shared_biases = 8 [ default = false ]; // Valid values are ones that divide the area of the output // grid in this convolutional layer. For example if this layer @@ -347,33 +359,35 @@ message LayerConfig { // the gpu device which the Layer's data in. // Only used by ParallelNeuralNetork. Ignored otherwise. - optional int32 device = 12 [default = -1]; + optional int32 device = 12 [ default = -1 ]; - // for recurrent layer. If true, the recurrence runs from the end to the beginning. 
- optional bool reversed = 13 [default = false]; + // for recurrent layer. If true, the recurrence runs from the end to the + // beginning. + optional bool reversed = 13 [ default = false ]; - // for lstmemory layer. Different types of nodes have different activation type. - optional string active_gate_type = 14; + // for lstmemory layer. Different types of nodes have different activation + // type. + optional string active_gate_type = 14; optional string active_state_type = 15; // For NCELayer // The number of random negative labels for each sample - optional int32 num_neg_samples = 16 [default = 10]; + optional int32 num_neg_samples = 16 [ default = 10 ]; // For NCELayer // The distribution for generating the random negative labels. // A uniform distribution will be used if not provided - repeated double neg_sampling_dist = 17 [packed = true]; + repeated double neg_sampling_dist = 17 [ packed = true ]; // For MaxLayer // default: output VALUE of MaxLayer. set this flag to true for output INDEX // INDEX will be put in Argument::value as double values. - optional bool output_max_index = 19 [default = false]; + optional bool output_max_index = 19 [ default = false ]; /// The filed number 20 have been deprecated. // For self-normalized estimation - optional double softmax_selfnorm_alpha = 21 [default = 0.1]; + optional double softmax_selfnorm_alpha = 21 [ default = 0.1 ]; /// The filed numbers 22 and 23 have been deprecated. @@ -384,14 +398,14 @@ message LayerConfig { optional bool norm_by_times = 25; // for CostLayers - optional double coeff = 26 [default = 1.0]; + optional double coeff = 26 [ default = 1.0 ]; // for AverageLayer // can be set to: 'average', 'sum' or 'squarerootn' optional string average_strategy = 27; // for error clipping - optional double error_clipping_threshold = 28 [default = 0.0]; + optional double error_clipping_threshold = 28 [ default = 0.0 ]; // for operators used by mixed layer repeated OperatorConfig operator_confs = 29; @@ -419,43 +433,44 @@ message LayerConfig { optional uint32 beam_size = 39; // for seqlastins layer, whether select first instead last - optional bool select_first = 40 [default = false]; + optional bool select_first = 40 [ default = false ]; // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer // can be set to: 'non-seq','seq' - optional string trans_type = 41 [default = 'non-seq']; + optional string trans_type = 41 [ default = 'non-seq' ]; // to indicate whether selective_fc layer // is used in sequence generation or not - optional bool selective_fc_pass_generation = 42 [default = false]; + optional bool selective_fc_pass_generation = 42 [ default = false ]; // to indicate whether selective_fc layer take its last input to // selected several columns and only compute the multiplications // between the input matrices and the selected columns of // the parameter matrices of this layer. // if set false, selective_fc degrades into fc. - optional bool has_selected_colums = 43 [default = true]; + optional bool has_selected_colums = 43 [ default = true ]; // this parameter is for speed consideration. // if number of the selected columns is less than // sample number * selective_fc output size * selective_fc_mull_mull_ratio // sparse multiplication is used, otherwise, using full multiplication. 
- optional double selective_fc_full_mul_ratio = 44 [default = 0.02]; + optional double selective_fc_full_mul_ratio = 44 [ default = 0.02 ]; // to indicate how many threads selective_fc use to to accelate // the plain_mul period // leave empty or set to 0 to disable multi-thread accleleration - optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 [default = 0]; + optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 + [ default = 0 ]; // for batch normalization layer // if set use_global_stats true, will use the loaded mean and variance. optional bool use_global_stats = 46; // use to compute moving mean and variance. - optional double moving_average_fraction = 47 [default = 0.9]; + optional double moving_average_fraction = 47 [ default = 0.9 ]; // bias size - optional uint32 bias_size = 48 [default = 0]; + optional uint32 bias_size = 48 [ default = 0 ]; // this parameter can be used as a user-defined parameter when necessary, // without changing the proto file. @@ -470,12 +485,17 @@ message LayerConfig { optional uint64 width = 51; // blank label used in ctc loss - optional uint32 blank = 52 [default = 0]; + optional uint32 blank = 52 [ default = 0 ]; - // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which + // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which // controls the scope of pooling operation. can be set > 0. // leave empty or set to -1 to disable this stride pooling. - optional int32 seq_pool_stride = 53 [default = -1]; + optional int32 seq_pool_stride = 53 [ default = -1 ]; + + // for crop layer + optional int32 axis = 54 [ default = 2 ]; + repeated uint32 offset = 55; + repeated uint32 shape = 56; } message EvaluatorConfig { @@ -491,9 +511,9 @@ message EvaluatorConfig { // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator // For multi binary labels: true if output > classification_threshold - optional double classification_threshold = 6 [default = 0.5]; + optional double classification_threshold = 6 [ default = 0.5 ]; // The positive label. 
-1 means average precision and recall - optional int32 positive_label = 7 [default = -1]; + optional int32 positive_label = 7 [ default = -1 ]; // load dict from this file optional string dict_file = 8; @@ -502,10 +522,10 @@ message EvaluatorConfig { optional string result_file = 9; // top # results for max id printer - optional int32 num_results = 10 [default = 1]; + optional int32 num_results = 10 [ default = 1 ]; // whether to delimit the sequence in the seq_text_printer - optional bool delimited = 11 [default = true]; + optional bool delimited = 11 [ default = true ]; // Used by ChunkEvaluator // chunk of these types are not counted @@ -513,23 +533,23 @@ message EvaluatorConfig { // Used by ClassificationErrorEvaluator // top # classification error - optional int32 top_k = 13 [default = 1]; + optional int32 top_k = 13 [ default = 1 ]; // Used by DetectionMAPEvaluator - optional double overlap_threshold = 14 [default = 0.5]; + optional double overlap_threshold = 14 [ default = 0.5 ]; - optional int32 background_id = 15 [default = 0]; + optional int32 background_id = 15 [ default = 0 ]; - optional bool evaluate_difficult = 16 [default = false]; + optional bool evaluate_difficult = 16 [ default = false ]; - optional string ap_type = 17 [default = "11point"]; + optional string ap_type = 17 [ default = "11point" ]; } message LinkConfig { required string layer_name = 1; required string link_name = 2; // If true, this link has sub-sequence - optional bool has_subseq = 3 [default = false]; + optional bool has_subseq = 3 [ default = false ]; } message MemoryConfig { @@ -542,18 +562,18 @@ message MemoryConfig { optional uint32 boot_with_const_id = 7; // memory is a sequence, initailized by a sequence boot layer - optional bool is_sequence = 6 [default = false]; + optional bool is_sequence = 6 [ default = false ]; } message GeneratorConfig { required uint32 max_num_frames = 1; required string eos_layer_name = 2; - optional int32 num_results_per_sample = 3 [default = 1]; + optional int32 num_results_per_sample = 3 [ default = 1 ]; // for beam search - optional int32 beam_size = 4 [default = 1]; + optional int32 beam_size = 4 [ default = 1 ]; - optional bool log_prob = 5 [default = true]; + optional bool log_prob = 5 [ default = true ]; } message SubModelConfig { @@ -563,10 +583,10 @@ message SubModelConfig { repeated string output_layer_names = 4; repeated string evaluator_names = 5; - optional bool is_recurrent_layer_group = 6 [default = false]; + optional bool is_recurrent_layer_group = 6 [ default = false ]; // If true, the recurrence runs from the end to the beginning. - optional bool reversed = 7 [default = false]; + optional bool reversed = 7 [ default = false ]; // name and link name of memory repeated MemoryConfig memories = 8; @@ -580,14 +600,15 @@ message SubModelConfig { optional GeneratorConfig generator = 11; - // the id of inlink which share info with outlinks, used in recurrent layer group + // the id of inlink which share info with outlinks, used in recurrent layer + // group optional int32 target_inlinkid = 12; } message ModelConfig { // type of the model. 
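For the classification_threshold field documented in EvaluatorConfig above: multi-binary-label evaluators count an output as positive when it exceeds the threshold (0.5 by default). A small numpy sketch of that rule, for illustration only:

import numpy as np

def to_binary_labels(outputs, classification_threshold=0.5):
    # true if output > classification_threshold, as used by
    # PrecisionRecallEvaluator and ClassificationErrorEvaluator
    return np.asarray(outputs) > classification_threshold

print(to_binary_labels([0.1, 0.7, 0.5, 0.92]))  # [False  True False  True]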
// Currently, "nn", "recurrent_nn" and "recursive_nn" are supported - required string type = 1 [default = "nn"]; + required string type = 1 [ default = "nn" ]; // layers should be ordered in such a way that the forward propagation // can be correctly executed by going from the first layer to the last layer diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto index 2a87e293f6..d27b1bcf80 100644 --- a/proto/OptimizerConfig.proto +++ b/proto/OptimizerConfig.proto @@ -1,5 +1,5 @@ syntax = "proto2"; - + option optimize_for = LITE_RUNTIME; package paddle; @@ -9,13 +9,11 @@ message SGDConfig { // momentum: float >= 0. Parameter updates momentum. // decay: float >= 0. Learning rate decay over each update. // nesterov: boolean. Whether to apply Nesterov momentum. - optional double momentum = 21 [default = 0.0]; - optional double decay = 23 [default = 0.0]; - optional bool nesterov =24 [default = false]; - + optional double momentum = 21 [ default = 0.0 ]; + optional double decay = 23 [ default = 0.0 ]; + optional bool nesterov = 24 [ default = false ]; } - message AdadeltaConfig { // Adadelta // It is recommended to leave it at the default value. @@ -23,21 +21,23 @@ message AdadeltaConfig { // epsilon: float >= 0. Fuzz factor. // decay: float >= 0. Learning rate decay over each update. - // reference : [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701) - optional double rho = 33 [default = 0.90]; - optional double epsilon = 31 [default = 1e-5]; - optional double decay = 32 [default = 0.0]; - + // reference : [Adadelta - an adaptive learning rate + // method](http://arxiv.org/abs/1212.5701) + optional double rho = 33 [ default = 0.90 ]; + optional double epsilon = 31 [ default = 1e-5 ]; + optional double decay = 32 [ default = 0.0 ]; } message AdagradConfig { -// Adagrad -// epsilon: float >= 0. -// decay: float >= 0. Learning rate decay over each update. + // Adagrad + // epsilon: float >= 0. + // decay: float >= 0. Learning rate decay over each update. -// reference : [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) - optional double epsilon = 41 [default = 1e-5]; - optional double decay = 42 [default = 0.0]; + // reference : [Adaptive Subgradient Methods for Online Learning and + // Stochastic + // Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + optional double epsilon = 41 [ default = 1e-5 ]; + optional double decay = 42 [ default = 0.0 ]; } message AdamConfig { @@ -46,7 +46,8 @@ message AdamConfig { // beta_2: float, 0 < beta < 1. Generally close to 1. // epsilon: float >= 0. Fuzz factor. // decay: float >= 0. Learning rate decay over each update. 
- // reference : [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8) + // reference : [Adam - A Method for Stochastic + // Optimization](http://arxiv.org/abs/1412.6980v8) optional double beta_1 = 41; optional double beta_2 = 42; optional double epsilon = 43; @@ -55,32 +56,32 @@ message AdamConfig { message ConstLrConfig { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; } message LinearLrConfig { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; optional double lr_decay_a = 2; optional double lr_decay_b = 3; } message TensorProto { -enum DataType { - PADDLE_ELEMENT_TYPE_INT32 = 0; - PADDLE_ELEMENT_TYPE_UINT32 = 1; - PADDLE_ELEMENT_TYPE_INT64 = 2; - PADDLE_ELEMENT_TYPE_UINT64 = 3; - PADDLE_ELEMENT_TYPE_FLOAT32 = 4; - PADDLE_ELEMENT_TYPE_FLOAT64 = 5; -} + enum DataType { + PADDLE_ELEMENT_TYPE_INT32 = 0; + PADDLE_ELEMENT_TYPE_UINT32 = 1; + PADDLE_ELEMENT_TYPE_INT64 = 2; + PADDLE_ELEMENT_TYPE_UINT64 = 3; + PADDLE_ELEMENT_TYPE_FLOAT32 = 4; + PADDLE_ELEMENT_TYPE_FLOAT64 = 5; + } optional DataType data_type = 1; repeated bytes content = 2; } message LrPolicyState { // learninRate Policy - optional double learning_rate = 1 [default = 1.0]; + optional double learning_rate = 1 [ default = 1.0 ]; optional double lr_decay_a = 2; optional double lr_decay_b = 3; } @@ -104,7 +105,6 @@ message AdadeltaOptimizerState { optional TensorProto update_delta = 4; } - message AdagradOptimizerState { optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; @@ -124,10 +124,10 @@ message AdamOptimizerState { message OptimizerConfig { enum Optimizer { - SGD = 1; - Adadelta = 2; - Adagrad = 3; - Adam = 4; + SGD = 1; + Adadelta = 2; + Adagrad = 3; + Adam = 4; } optional Optimizer optimizer = 1; optional SGDConfig sgd = 3; @@ -136,8 +136,8 @@ message OptimizerConfig { optional AdamConfig adam = 6; enum LrPolicy { - Const = 0; - Linear = 1; + Const = 0; + Linear = 1; } optional LrPolicy lr_policy = 11; optional ConstLrConfig const_lr = 12; diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto index 580d663246..b13570a2c6 100644 --- a/proto/ParameterConfig.proto +++ b/proto/ParameterConfig.proto @@ -27,56 +27,57 @@ enum ParameterInitStrategy { message ParameterUpdaterHookConfig { // hook type such as 'pruning' required string type = 1; - // this represents the ratio of zero element to be set by the Parameter - optional double sparsity_ratio = 2 [default = 0.6]; + // this represents the ratio of zero element to be set by the Parameter + optional double sparsity_ratio = 2 [ default = 0.6 ]; } message ParameterConfig { required string name = 1; required uint64 size = 2; - optional double learning_rate = 3 [default = 1.0]; - optional double momentum = 4 [default = 0.0]; - optional double initial_mean = 5 [default = 0.0]; - optional double initial_std = 6 [default = 0.01]; + optional double learning_rate = 3 [ default = 1.0 ]; + optional double momentum = 4 [ default = 0.0 ]; + optional double initial_mean = 5 [ default = 0.0 ]; + optional double initial_std = 6 [ default = 0.01 ]; // use L2-regularization if decay_rate set and decay_rate_l1 not set - optional double decay_rate = 7 [default = 0.0]; + optional double decay_rate = 7 [ default = 0.0 ]; // use L1-regularization if decay_rate_l1 set - optional double decay_rate_l1 = 8 [default = 0.0]; + optional double decay_rate_l1 = 8 [ default = 0.0 ]; 
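The decay_rate / decay_rate_l1 fields above select the regularization form: L2 when only decay_rate is set, L1 when decay_rate_l1 is set. A minimal numpy sketch of adding that penalty to a gradient (illustration only, not the actual updater code):

import numpy as np

def add_weight_decay(param, grad, decay_rate=0.0, decay_rate_l1=0.0):
    # use L1-regularization if decay_rate_l1 set
    if decay_rate_l1 > 0.0:
        return grad + decay_rate_l1 * np.sign(param)
    # use L2-regularization if decay_rate set and decay_rate_l1 not set
    return grad + decay_rate * param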
// dims of Parameter, e.g. dims[0] as height, dims[1] as width.. repeated uint64 dims = 9; // the gpu device which the parameter in. // Only used by ParallelNeuralNetork. Ignored otherwise. - optional int32 device = 10 [default = -1]; + optional int32 device = 10 [ default = -1 ]; // how to init the parameter: 0 -> normal, 1 -> uniform // 0: treat initial_mean as mean, intial_std as standard deviation // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std) - optional int32 initial_strategy = 11 [default = 0]; + optional int32 initial_strategy = 11 [ default = 0 ]; // define the variance when init the parameter, by height of the Matrix - optional bool initial_smart = 12 [default = false]; + optional bool initial_smart = 12 [ default = false ]; // apply regularization every # batches - optional int32 num_batches_regularization = 13 [default = 1]; + optional int32 num_batches_regularization = 13 [ default = 1 ]; // if is_sparse is true, para is sparse, else para is dense - optional bool is_sparse = 14[default = false]; - // if para is sparse, format should be "csc" or "csr", empty means is not sparse - optional string format = 15 [default = ""]; + optional bool is_sparse = 14 [ default = false ]; + // if para is sparse, format should be "csc" or "csr", empty means is not + // sparse + optional string format = 15 [ default = "" ]; // sparse remote update or not - optional bool sparse_remote_update = 16 [default = false]; + optional bool sparse_remote_update = 16 [ default = false ]; // gradient clipping threshold, no clipping by default - optional double gradient_clipping_threshold = 17 [default = 0.0]; + optional double gradient_clipping_threshold = 17 [ default = 0.0 ]; // static parameters are fixed when training - optional bool is_static = 18 [default = false]; + optional bool is_static = 18 [ default = false ]; // para_id should NOT be set by config_parser. It is for // internal use. optional uint64 para_id = 19; repeated ParameterUpdaterHookConfig update_hooks = 20; // setup load mat -> csr - optional bool need_compact = 21 [default = false]; + optional bool need_compact = 21 [ default = false ]; // whether to do sparse update for this parameter - optional bool sparse_update = 22 [default = false]; + optional bool sparse_update = 22 [ default = false ]; // whether this parameter is shared or not. - optional bool is_shared = 23 [default = false]; + optional bool is_shared = 23 [ default = false ]; // parameter block size - optional uint64 parameter_block_size = 24 [default = 0]; + optional uint64 parameter_block_size = 24 [ default = 0 ]; } diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto index 404f961379..bd63cf35b1 100644 --- a/proto/ParameterServerConfig.proto +++ b/proto/ParameterServerConfig.proto @@ -15,13 +15,10 @@ syntax = "proto2"; package paddle; - /** * Configuration structure for ParameterClient2. */ -message ParameterClientConfig { - required int32 trainer_id = 1; -} +message ParameterClientConfig { required int32 trainer_id = 1; } /** * Configuration structure for ParameterServer2. 
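The initial_strategy comment in ParameterConfig above defines the two built-in initializers. A small numpy sketch of that mapping, using the initial_mean / initial_std defaults from the same message (illustrative only):

import numpy as np

def init_parameter(dims, initial_strategy=0, initial_mean=0.0, initial_std=0.01):
    if initial_strategy == 0:
        # 0: treat initial_mean as mean, initial_std as standard deviation
        return np.random.normal(initial_mean, initial_std, size=dims)
    # 1: range is (initial_mean - initial_std) to (initial_mean + initial_std)
    return np.random.uniform(initial_mean - initial_std,
                             initial_mean + initial_std, size=dims)

weights = init_parameter([256, 128])  # dims[0] as height, dims[1] as width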
@@ -30,24 +27,24 @@ message ParameterServerConfig { // Number of ports for sending dense parameter, // following ports on parameter server will be visited // for sending dense parameter: [port, port+ports_num-1] - required int32 ports_num = 1 [default = 1]; + required int32 ports_num = 1 [ default = 1 ]; // Number of ports for sending sparse parameter, // following ports on parameter server will be visited // for sending sparse parameter: // [port+ports_num, port+ports_num+ports_num_for_sparse-1] - required int32 ports_num_for_sparse = 2 [default = 0]; + required int32 ports_num_for_sparse = 2 [ default = 0 ]; // network device name for pservers - required string nics = 3 [default = "xgbe0,xgbe1"]; - required string rdma_tcp = 4 [default = "tcp"]; + required string nics = 3 [ default = "xgbe0,xgbe1" ]; + required string rdma_tcp = 4 [ default = "tcp" ]; // Listening port for pserver - required int32 port = 5 [default = 20134]; + required int32 port = 5 [ default = 20134 ]; // number of gradient servers - required int32 num_gradient_servers = 6 [default = 1]; + required int32 num_gradient_servers = 6 [ default = 1 ]; // number of threads for sync op exec - required int32 pserver_num_threads = 7 [default = 1]; + required int32 pserver_num_threads = 7 [ default = 1 ]; // control config_.async_lagged_grad_discard_ratio() min value - required double async_lagged_ratio_min = 8 [default = 1.0]; + required double async_lagged_ratio_min = 8 [ default = 1.0 ]; // if async_lagged_grad_discard_ratio is not set in trainer_config.conf // use it as defalut value - required double async_lagged_ratio_default = 9 [default = 1.5]; + required double async_lagged_ratio_default = 9 [ default = 1.5 ]; } \ No newline at end of file diff --git a/proto/ParameterService.proto b/proto/ParameterService.proto index c1c04d8cc5..e3c180ccc3 100644 --- a/proto/ParameterService.proto +++ b/proto/ParameterService.proto @@ -23,8 +23,8 @@ package paddle; */ enum ParameterUpdateMode { // Set parameter - PSERVER_UPDATE_MODE_SET_PARAM = 0;//use local param - PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1;//set zero param + PSERVER_UPDATE_MODE_SET_PARAM = 0; // use local param + PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param // Update parameter once a gradient is received PSERVER_UPDATE_MODE_ASYNC_SGD = 2; @@ -37,7 +37,7 @@ enum ParameterUpdateMode { // No update. Only get parameters back. 
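The ports_num / ports_num_for_sparse comments in ParameterServerConfig above spell out which ports a trainer visits on a parameter server. A small sketch of those two ranges (plain Python; the defaults are the field defaults from the message):

def pserver_port_ranges(port=20134, ports_num=1, ports_num_for_sparse=0):
    # dense parameters: [port, port + ports_num - 1]
    dense = list(range(port, port + ports_num))
    # sparse parameters:
    # [port + ports_num, port + ports_num + ports_num_for_sparse - 1]
    sparse = list(range(port + ports_num,
                        port + ports_num + ports_num_for_sparse))
    return dense, sparse

print(pserver_port_ranges(ports_num=2, ports_num_for_sparse=2))
# ([20134, 20135], [20136, 20137])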
PSERVER_UPDATE_MODE_GET_PARAM = 5; - PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6;//only get sparse rows + PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows }; message ParameterBlock { @@ -80,42 +80,34 @@ message SendParameterRequest { optional int32 trainer_id = 7; // send back parameter type on pserver, PARAMETER_VALUE by default - optional int32 send_back_parameter_type = 8 [default = 0]; + optional int32 send_back_parameter_type = 8 [ default = 0 ]; // forwardbackward time in usec optional uint64 forwardbackward_time = 9; - } -message WaitPassStartRequest { -} +message WaitPassStartRequest {} -message WaitPassStartResponse { -} +message WaitPassStartResponse {} -message WaitPassFinishRequest { -} +message WaitPassFinishRequest {} -message WaitPassFinishResponse { -} +message WaitPassFinishResponse {} enum SyncObject { SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_ - SYNC_DATA = 1; // wait for the synchronizeDataBarrier_ + SYNC_DATA = 1; // wait for the synchronizeDataBarrier_ } message SynchronizeRequest { - required SyncObject sync_object_id = 1 [default = SYNC_DEFAULT]; + required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ]; optional int32 trainer_id = 2; } -message SynchronizeResponse { -} +message SynchronizeResponse {} -message SendParameterResponse { - repeated ParameterBlock blocks = 1; -} +message SendParameterResponse { repeated ParameterBlock blocks = 1; } message SetConfigRequest { repeated ParameterConfig param_configs = 1; @@ -125,26 +117,18 @@ message SetConfigRequest { required bool is_sparse_server = 6; } -message SetConfigResponse{ -} +message SetConfigResponse {} -message GetStatusRequest { -} +message GetStatusRequest {} -message GetStatusResponse { - required PServerStatus status = 1; -} +message GetStatusResponse { required PServerStatus status = 1; } -message SetStatusRequest { - required PServerStatus status = 1; -} +message SetStatusRequest { required PServerStatus status = 1; } -message SetStatusResponse { -} +message SetStatusResponse {} // create a column vector. The size is the dimension of parameter -message CreateVectorRequest { -} +message CreateVectorRequest {} message CreateVectorResponse { // error message. Empty if success @@ -153,9 +137,7 @@ message CreateVectorResponse { required int64 handle = 2; } -message ReleaseVectorRequest { - required int64 handle = 1; -} +message ReleaseVectorRequest { required int64 handle = 1; } message ReleaseVectorResponse { // error message. Empty if success @@ -164,9 +146,7 @@ message ReleaseVectorResponse { // Create a column major matrix. The number of rows is the dimension // of parameter. The number of columns is specifed by num_cols -message CreateMatrixRequest { - required int32 num_cols = 1; -} +message CreateMatrixRequest { required int32 num_cols = 1; } message CreateMatrixResponse { // error message. Empty if success @@ -175,16 +155,13 @@ message CreateMatrixResponse { required int64 handle = 2; } -message ReleaseMatrixRequest { - required int64 handle = 1; -} +message ReleaseMatrixRequest { required int64 handle = 1; } message ReleaseMatrixResponse { // error message. 
Empty if success optional string return_message = 1; } - /** * The operations are defined using the variables commented at Operation * and OperationResult @@ -245,36 +222,36 @@ enum MatrixVectorOperation { message ProtoVector { required int64 dim = 1; - repeated double values = 2 [packed = true]; + repeated double values = 2 [ packed = true ]; } message ProtoMatrix { required int64 num_rows = 1; required int64 num_cols = 2; - repeated double values = 3 [packed = true]; + repeated double values = 3 [ packed = true ]; } message Operation { required MatrixVectorOperation operation = 1; // vector handles created on the pserver - repeated int64 pvectors = 2; // u, v, w + repeated int64 pvectors = 2; // u, v, w // matrix handles created on the pserver - repeated int64 pmatrices = 3; // A, B, C + repeated int64 pmatrices = 3; // A, B, C - repeated double scalars = 4; // a, b, c - repeated ProtoVector vectors = 5; // x, y, z - repeated ProtoMatrix matrices = 6; // X, Y, Z + repeated double scalars = 4; // a, b, c + repeated ProtoVector vectors = 5; // x, y, z + repeated ProtoMatrix matrices = 6; // X, Y, Z } message OperationResult { // error message. Empty if success optional string return_message = 1; -// - repeated double scalars = 2; // d, e, f + // + repeated double scalars = 2; // d, e, f repeated ProtoVector vectors = 3; // p, q, r - repeated ProtoMatrix matrices = 4; // P, Q, R + repeated ProtoMatrix matrices = 4; // P, Q, R } message DoOperationRequest { @@ -301,18 +278,14 @@ message DoOperationResponse { required bool pass_finish = 3; } -message LoadValueRequest { - required string dir_name = 1; -} +message LoadValueRequest { required string dir_name = 1; } message LoadValueResponse { // error message. Empty if success optional string return_message = 1; } -message SaveValueRequest { - required string dir_name = 1; -} +message SaveValueRequest { required string dir_name = 1; } message SaveValueResponse { // error message. 
Empty if success @@ -331,11 +304,11 @@ enum DataUpdateMode { // Client send it's own ref label to pserver DATA_UPDATE_MODE_SET_REF_LABEL = 4; // Client get all ref labels from all pservers - DATA_UPDATE_MODE_GET_REF_LABEL =5; + DATA_UPDATE_MODE_GET_REF_LABEL = 5; // Client send it's own ref grad to pserver - DATA_UPDATE_MODE_SET_REF_GRAD =6; + DATA_UPDATE_MODE_SET_REF_GRAD = 6; // Client get all ref grad from all pservers - DATA_UPDATE_MODE_GET_REF_GRAD =7; + DATA_UPDATE_MODE_GET_REF_GRAD = 7; } enum SendDataType { @@ -360,7 +333,7 @@ message DataBlock { // byte size of one data type required int32 data_size = 2; // data_type - optional TransDataType data_type = 3 [default = TRANS_DOUBLE]; + optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ]; } message SendDataRequest { diff --git a/proto/TrainerConfig.proto b/proto/TrainerConfig.proto index a819d20d11..b7c2355159 100644 --- a/proto/TrainerConfig.proto +++ b/proto/TrainerConfig.proto @@ -20,14 +20,14 @@ package paddle; message OptimizationConfig { required int32 batch_size = 3; - required string algorithm = 4 [default = "async_sgd"]; - optional int32 num_batches_per_send_parameter = 5 [default = 1]; - optional int32 num_batches_per_get_parameter = 6 [default = 1]; + required string algorithm = 4 [ default = "async_sgd" ]; + optional int32 num_batches_per_send_parameter = 5 [ default = 1 ]; + optional int32 num_batches_per_get_parameter = 6 [ default = 1 ]; required double learning_rate = 7; - optional double learning_rate_decay_a = 8 [default = 0]; - optional double learning_rate_decay_b = 9 [default = 0]; - optional string learning_rate_schedule = 27 [default = "constant"]; + optional double learning_rate_decay_a = 8 [ default = 0 ]; + optional double learning_rate_decay_b = 9 [ default = 0 ]; + optional string learning_rate_schedule = 27 [ default = "constant" ]; // learning rate will be scaled according to learning_rate_schedule // 1), constant: // lr = learning_rate @@ -49,88 +49,92 @@ message OptimizationConfig { // owlqn related // L1-regularization - optional double l1weight = 10 [default = 0.1]; + optional double l1weight = 10 [ default = 0.1 ]; // L2-regularization - optional double l2weight = 11 [default = 0]; + optional double l2weight = 11 [ default = 0 ]; // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step) // then accept the step - optional double c1 = 12 [default = 0.0001]; + optional double c1 = 12 [ default = 0.0001 ]; // multiply the step with "backoff", when wolfe condition doesn't satisfy - optional double backoff = 13 [default = 0.5]; + optional double backoff = 13 [ default = 0.5 ]; // how many "s"s and "y"s are kept in owlqn - optional int32 owlqn_steps = 14 [default = 10]; + optional int32 owlqn_steps = 14 [ default = 10 ]; // accept the step if encountered "max_backoff" times of "reduce the step" - optional int32 max_backoff = 15 [default = 5]; + optional int32 max_backoff = 15 [ default = 5 ]; // L2-regularization coefficient is reduced linearly from iteration 0 to // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter" // iterations. set "l2weight_zero_iter" to 0 to disable this strategy. - optional int32 l2weight_zero_iter = 17 [default = 0]; + optional int32 l2weight_zero_iter = 17 [ default = 0 ]; // averaged sgd // About average_window * numBatchProcessed parameter are used // for average. To be accurate, between average_window * numBatchProcessed // and 2 * average_window * numBatchProcessed parameters are used for // average. 
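The c1, backoff and max_backoff fields documented above describe owlqn's step acceptance: shrink the step until the sufficient-decrease condition newobj <= oldobj + c1 * origDirDeriv * step holds, accepting the step anyway after max_backoff reductions. A minimal sketch with a toy objective (plain Python, illustrative only):

def backtracking_step(new_obj, old_obj, orig_dir_deriv,
                      step=1.0, c1=0.0001, backoff=0.5, max_backoff=5):
    # shrink the step until newobj <= oldobj + c1 * origDirDeriv * step,
    # accepting it after max_backoff reductions
    for _ in range(max_backoff):
        if new_obj(step) <= old_obj + c1 * orig_dir_deriv * step:
            return step
        step *= backoff
    return step

# toy objective f(x) = x^2, starting at x = 1 along the negative gradient
f = lambda x: x * x
x, direction = 1.0, -2.0
orig_dir_deriv = 2.0 * x * direction          # f'(x) * direction = -4
step = backtracking_step(lambda s: f(x + s * direction), f(x), orig_dir_deriv)
print(step, x + step * direction)             # 0.5 0.0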
- optional double average_window = 18 [default = 0]; - optional int64 max_average_window = 19 [default = 0x7fffffffffffffff]; + optional double average_window = 18 [ default = 0 ]; + optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ]; ////////////////////////// // Options Adaptive SGD // ////////////////////////// - // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", "rmsprop" - // default learning method("momentum") use global decayed learning rate with momentum. + // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", + // "rmsprop" + // default learning method("momentum") use global decayed learning rate with + // momentum. // "adagrad", "adadelta" and "rmsprop" can set momentum too. - optional string learning_method = 23 [default = "momentum"]; - optional double ada_epsilon = 24 [default = 1e-6]; - optional double ada_rou = 26 [default = 0.95]; + optional string learning_method = 23 [ default = "momentum" ]; + optional double ada_epsilon = 24 [ default = 1e-6 ]; + optional double ada_rou = 26 [ default = 0.95 ]; // Force to do average in cpu in order to save gpu memory usage - optional bool do_average_in_cpu = 25 [default = false]; + optional bool do_average_in_cpu = 25 [ default = false ]; // delta add rate in pserver, used while num_batches_per_send_parameter>1 // will be divided by #machines automatically. - optional double delta_add_rate = 28 [default = 1.0]; + optional double delta_add_rate = 28 [ default = 1.0 ]; // We split a large size into smaller mini-batches, whose sizes are // determined by mini_batch_size. It only takes effect when there is // an ExternalMachine. - optional int32 mini_batch_size = 29 [default = 128]; + optional int32 mini_batch_size = 29 [ default = 128 ]; // automatically set if any one of parameters set sparse remote update flag - optional bool use_sparse_remote_updater = 30 [default = false]; + optional bool use_sparse_remote_updater = 30 [ default = false ]; - // how to update center parameter and feedback to local parameter, + // how to update center parameter and feedback to local parameter, // when use local sgd update in cluster training. - // A option is elastic_average, proposed by the paper: Deep learning with elastic averaging SGD. - // If use elastic_average method, every trainer node should sample from whole data sets. - optional string center_parameter_update_method = 31 [default = "average"]; + // A option is elastic_average, proposed by the paper: Deep learning with + // elastic averaging SGD. + // If use elastic_average method, every trainer node should sample from whole + // data sets. 
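The comment above notes that center_parameter_update_method (declared just below) may be set to elastic_average, following the cited EASGD paper. A rough numpy sketch of the symmetric elastic-averaging update from that paper; alpha is an illustrative coupling rate, not a field of this config:

import numpy as np

def elastic_average_step(local_param, center_param, alpha=0.05):
    # the local parameter is pulled towards the center parameter while the
    # center parameter moves towards the local one by the same elastic force
    diff = local_param - center_param
    return local_param - alpha * diff, center_param + alpha * diff

local, center = np.array([1.0, 2.0]), np.zeros(2)
local, center = elastic_average_step(local, center)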
+ optional string center_parameter_update_method = 31 [ default = "average" ]; // shrink sparse parameter value // only works if parameter is remote sparse update and has L1 decay rate - optional double shrink_parameter_value = 32 [default = 0]; + optional double shrink_parameter_value = 32 [ default = 0 ]; //////////////////////////// // Options Adam Optimizer // //////////////////////////// - optional double adam_beta1 = 33 [default = 0.9]; - optional double adam_beta2 = 34 [default = 0.999]; - optional double adam_epsilon = 35 [default = 1e-8]; + optional double adam_beta1 = 33 [ default = 0.9 ]; + optional double adam_beta2 = 34 [ default = 0.999 ]; + optional double adam_epsilon = 35 [ default = 1e-8 ]; // arguments for learning rate scheduler // Format: num1:rate1,num2:rate2,...,numK:rateK // For learning_rate_schedule="manual", num is the number of samples, // For learning_rate_schedule="pass_manual", // num is the number of passes (starting from 0) - optional string learning_rate_args = 36 [default = ""]; - + optional string learning_rate_args = 36 [ default = "" ]; + // for async sgd gradient commit control. // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed, // current async gradient will be discard silently. - optional double async_lagged_grad_discard_ratio = 37 [default = 1.5]; + optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ]; - // global threshold for gradient clipping - optional double gradient_clipping_threshold = 38 [default = 0.0]; + // global threshold for gradient clipping + optional double gradient_clipping_threshold = 38 [ default = 0.0 ]; }; message TrainerConfig { @@ -141,7 +145,7 @@ message TrainerConfig { repeated string config_files = 5; // the directory to save/load model files for each training path - optional string save_dir = 6 [default = "./output/model"]; + optional string save_dir = 6 [ default = "./output/model" ]; // Path of the initial model parameters. // If it was set, start_pass will be ignored. @@ -149,7 +153,7 @@ message TrainerConfig { // Start training from this pass. // Will load parameter from the previous pass. - optional int32 start_pass = 8 [default = 0]; + optional int32 start_pass = 8 [ default = 0 ]; // file path to the trainer config file optional string config_file = 9; diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0171f9d8cc..7bd6d59b00 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,5 +1,3 @@ -set(OUTPUT_DIR - "${CMAKE_CURRENT_BINARY_DIR}/build") file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) file(GLOB HELPERS_PY_FILES . 
./paddle/trainer_config_helpers/*.py) @@ -18,28 +16,45 @@ SET(COPY_PADDLE_MASTER "") if(WITH_GOLANG) SET(COPY_PADDLE_MASTER "copy_paddle_master") add_custom_command(TARGET ${COPY_PADDLE_MASTER} - COMMAND cp ${paddle_master_LIB_PATH} ${PROJ_ROOT}/python/paddle/v2/master/ + COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/ ) add_dependencies(copy_paddle_master paddle_master) endif(WITH_GOLANG) +set(MKL_SHARED_LIBS "") +set(MKL_DEPENDS "") +if(WITH_MKLML) + list(APPEND MKL_SHARED_LIBS ${MKLML_LIB} ${MKLML_IOMP_LIB}) + list(APPEND MKL_DEPENDS mklml) +endif() + +if(WITH_MKLDNN) + list(APPEND MKL_SHARED_LIBS "${MKLDNN_LIB}" "${MKLDNN_LIB}.0") + list(APPEND MKL_DEPENDS mkldnn) +endif() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) -add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/framework/core.so - COMMAND cmake -E copy $ ${PROJ_ROOT}/python/paddle/v2/framework/core.so +add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so + COMMAND cmake -E copy $ ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so DEPENDS paddle_pybind) -add_custom_target(copy_paddle_pybind ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/framework/core.so) +add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so) -add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp +add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel - COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) -add_custom_target(paddle_python ALL DEPENDS - ${OUTPUT_DIR}/.timestamp) +set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS}) +if(WITH_SWIG_PY) + list(APPEND paddle_python_deps python_api_wheel) +endif() +add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 826ba2834a..8d71629faa 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -565,6 +565,35 @@ class IdentityOffsetProjection(Projection): return [] +@config_class +class SliceProjection(Projection): + type = 'slice' + + def __init__(self, input_layer_name, slices, **xargs): + super(SliceProjection, self).__init__(input_layer_name, **xargs) + input = g_layer_map[input_layer_name] + if input.type in ["exconv", "cudnn_conv"]: + # the slice operator is for the channel dimension + assert input.num_filters is not None + channels = input.num_filters + image_size = input.size / channels + assert slices[len(slices) - 1][1] <= channels + for i in xrange(len(slices)): + slice = self.proj_conf.slices.add() + slice.start = slices[i][0] * image_size + slice.end = slices[i][1] * image_size + self.size += slice.end - slice.start + else: + config_assert(False, + 'Currently the input should be convolution layer') + + def calc_parameter_size(self, input_size, output_size): + return 0 + + def 
calc_parameter_dims(self, input_size, output_size): + return [] + + # DotMulProjection performs element-wise multiplication with weight @config_class class DotMulProjection(Projection): @@ -1575,15 +1604,36 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase): @config_layer('fc') class FCLayer(LayerBase): - def __init__(self, name, size, inputs, bias=True, **xargs): - super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs) + layer_type = 'fc' + + def __init__(self, + name, + size, + inputs, + bias=True, + error_clipping_threshold=None, + **xargs): + use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) + use_mkldnn_wgt = bool( + int(g_command_config_args.get("use_mkldnn_wgt", 0))) + if use_mkldnn: + self.layer_type = 'mkldnn_fc' + config_assert( + len(inputs) == 1, + "MkldnnFCLayer support one and only one input!") + super(FCLayer, self).__init__( + name, self.layer_type, size, inputs=inputs, **xargs) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) psize = self.config.size * input_layer.size dims = [input_layer.size, self.config.size] format = self.inputs[input_index].format sparse = format == "csr" or format == "csc" - + if use_mkldnn: + config_assert(not sparse, + "MkldnnFCLayer do not support sparse format yet") + if use_mkldnn_wgt: + dims = [self.config.size, input_layer.size] if sparse: psize = self.inputs[input_index].nnz else: @@ -1592,6 +1642,13 @@ class FCLayer(LayerBase): self.create_input_parameter(input_index, psize, dims, sparse, format) self.create_bias_parameter(bias, self.config.size) + if error_clipping_threshold is not None: + self.config.error_clipping_threshold = error_clipping_threshold + + +@config_layer('mkldnn_fc') +class MkldnnFcLayer(FCLayer): + layer_type = 'mkldnn_fc' @config_layer('selective_fc') @@ -1990,6 +2047,23 @@ class PadLayer(LayerBase): self.config.size = out_ch * out_h * out_w +@config_layer('crop') +class CropLayer(LayerBase): + def __init__(self, name, inputs, axis, offset, shape, **xargs): + super(CropLayer, self).__init__(name, 'crop', 0, inputs=inputs, **xargs) + self.config.axis = axis + self.config.offset.extend(offset) + self.config.shape.extend(shape) + + # get channel, width and height from input_0 layer + input_layer = self.get_input_layer(0) + image_conf = self.config.inputs[0].image_conf + image_conf.img_size = input_layer.width + image_conf.img_size_y = input_layer.height + image_conf.channels = input_layer.size / (input_layer.width * + input_layer.height) + + @config_layer('batch_norm') class BatchNormLayer(LayerBase): layer_type = 'batch_norm' @@ -2030,8 +2104,7 @@ class BatchNormLayer(LayerBase): # Automatically select cudnn_batch_norm for GPU and batch_norm for CPU. # Also based on cudnn version. 
use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \ - ((not parallel_nn) or self.config.device > -1) and \ - cudnn_version >= 4007 + ((not parallel_nn) or self.config.device > -1) self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm" super(BatchNormLayer, self).__init__( name, self.layer_type, 0, inputs=inputs, **xargs) @@ -2145,6 +2218,34 @@ class RowConvLayer(LayerBase): self.create_input_parameter(0, psize, dims) +@config_layer('clip') +class ClipLayer(LayerBase): + def __init__(self, name, inputs, min, max, **xargs): + super(ClipLayer, self).__init__(name, 'clip', 0, inputs=inputs, **xargs) + config_assert( + len(self.inputs) == 1, + 'ClipLayer must have one and only one input.') + config_assert(min < max, 'min must be less than max.') + input_layer = self.get_input_layer(0) + self.set_layer_size(input_layer.size) + self.config.inputs[0].clip_conf.min = min + self.config.inputs[0].clip_conf.max = max + + +@config_layer('scale_shift') +class ScaleShiftLayer(LayerBase): + def __init__(self, name, inputs, bias=True, **xargs): + super(ScaleShiftLayer, self).__init__( + name, 'scale_shift', 0, inputs=inputs, **xargs) + config_assert( + len(self.inputs) == 1, + 'ScaleShiftLayer must have one and only one input.') + input_layer = self.get_input_layer(0) + self.set_layer_size(input_layer.size) + self.create_input_parameter(0, 1, [1, 1]) + self.create_bias_parameter(bias, 1) + + # key: cost type # value: cost class g_cost_map = {} @@ -2590,6 +2691,31 @@ class SubSequenceLayer(LayerBase): self.create_bias_parameter(bias, size) +@config_layer('sub_nested_seq') +class SubNestedSequenceLayer(LayerBase): + def __init__(self, name, inputs, selected_indices, bias=False, **xargs): + if isinstance(inputs, list): + assert len(inputs) == 1, ('the first input of sub_nested_seq ' + 'layer is a single nested sequence.') + inputs = inputs[0] + if isinstance(selected_indices, list): + assert len(selected_indices) == 1, ( + 'the second input of ' + 'sub_nested_seq layer is a single layer which is a ' + 'set of selected indices.') + selected_indices = selected_indices[0] + + super(SubNestedSequenceLayer, self).__init__( + name, + 'sub_nested_seq', + 0, + inputs=[inputs, selected_indices], + **xargs) + input_layer0 = self.get_input_layer(0) + size = input_layer0.size + self.set_layer_size(size) + + @config_layer('out_prod') class OuterProdLayer(LayerBase): def __init__(self, name, inputs, device=None): @@ -2701,6 +2827,16 @@ class SumToOneNormLayer(LayerBase): self.set_layer_size(input_layer0.size) +@config_layer('row_l2_norm') +class RowL2NormLayer(LayerBase): + def __init__(self, name, inputs, **xargs): + super(RowL2NormLayer, self).__init__( + name, 'row_l2_norm', 0, inputs=inputs, **xargs) + config_assert(len(self.inputs) == 1, 'RowL2NormLayer must have 1 input') + input_layer = self.get_input_layer(0) + self.set_layer_size(input_layer.size) + + @config_layer('cos_vm') class CosSimVecMatLayer(LayerBase): def __init__(self, name, size, inputs, cos_scale=1.0, device=None): @@ -3146,6 +3282,16 @@ class CTCLayer(LayerBase): config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs') +@config_layer('kmax_seq_score') +class KmaxSeqScoreLayer(LayerBase): + def __init__(self, name, inputs, beam_size, **xargs): + super(KmaxSeqScoreLayer, self).__init__( + name, 'kmax_seq_score', 0, inputs=inputs, **xargs) + config_assert( + len(self.inputs) == 1, 'KmaxSeqScoreLayer has only one input.') + self.config.beam_size = beam_size + + @config_layer('warp_ctc') class WarpCTCLayer(LayerBase): def 
__init__(self, @@ -3194,6 +3340,10 @@ def ParameterHook(type, **kwargs): if sparsity_ratio is not None: hook.sparsity_ratio = sparsity_ratio return hook + elif type == 'dpruning': + hook = ParameterUpdaterHookConfig() + hook.type = type + return hook else: return None diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 9b9f979bb6..ecba871910 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -272,7 +272,7 @@ class ExtraLayerAttribute(object): for key in self.attr: if not hasattr(self, 'can_%s' % key) or \ not getattr(self, 'can_%s' % key): - raise NotImplementedError("Layer %s cannot support %s" % + raise NotImplementedError("Layer %s does not support %s" % (layer_name, key)) @staticmethod diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index 44d52edfa7..57979db4de 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -298,8 +298,8 @@ def pnpair_evaluator( input, label, info, - name=None, - weight=None, ): + weight=None, + name=None, ): """ Positive-negative pair rate Evaluator which adapts to rank task like learning to rank. This evaluator must contain at least three layers. @@ -308,27 +308,31 @@ def pnpair_evaluator( .. code-block:: python - eval = pnpair_evaluator(input, info, label) + eval = pnpair_evaluator(input, label, info) - :param name: Evaluator name. - :type name: None|basestring :param input: Input Layer name. The output prediction of network. :type input: LayerOutput :param label: Label layer name. :type label: LayerOutput - :param info: Label layer name. (TODO, explaination) + :param info: Info layer name. (TODO, explaination) :type info: LayerOutput :param weight: Weight Layer name. It should be a matrix with size [sample_num, 1]. (TODO, explaination) :type weight: LayerOutput + :param name: Evaluator name. + :type name: None|basestring """ + if not isinstance(input, list): + input = [input] + if label: + input.append(label) + if info: + input.append(info) evaluator_base( - name=name, - type="pnpair", input=input, - label=label, - info=info, - weight=weight) + type="pnpair", + weight=weight, + name=name, ) @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION) @@ -429,12 +433,12 @@ def chunk_evaluator( .. code-block:: text - Scheme Description + Scheme Description plain Use the same label for the whole chunk. - IOB Two labels for chunk type X, B-X for chunk begining and I-X for chunk inside. + IOB Two labels for chunk type X, B-X for chunk begining and I-X for chunk inside. IOE Two labels for chunk type X, E-X for chunk ending and I-X for chunk inside. - IOBES Four labels for chunk type X, B-X for chunk begining, I-X for chunk inside, E-X for chunk end and S-X for single word chunk. - + IOBES Four labels for chunk type X, B-X for chunk begining, I-X for chunk inside, E-X for chunk end and S-X for single word chunk. + To make it clear, let's illustrate by an NER example. 
Assuming that there are three named entity types including ORG, PER and LOC which are called 'chunk type' here, if 'IOB' scheme were used, the label set will be extended to a set including B-ORG, I-ORG, B-PER, I-PER, B-LOC, I-LOC and O, @@ -451,7 +455,7 @@ def chunk_evaluator( tagType = label % numTagType chunkType = label / numTagType otherChunkType = numChunkTypes - + The following table shows the mapping rule between tagType and tag type in each scheme. .. code-block:: text @@ -475,7 +479,7 @@ def chunk_evaluator( O 6 In this example, chunkType has three values: 0 for ORG, 1 for PER, 2 for LOC, because the scheme is - "IOB" so tagType has two values: 0 for B and 1 for I. + "IOB" so tagType has two values: 0 for B and 1 for I. Here we will use I-LOC to explain the above mapping rules in detail. For I-LOC, the label id is 5, so we can get tagType=1 and chunkType=2, which means I-LOC is a part of NER chunk LOC and the tag is I. @@ -486,7 +490,7 @@ def chunk_evaluator( eval = chunk_evaluator(input, label, chunk_scheme, num_chunk_types) - + :param input: The input layers. :type input: LayerOutput :param label: An input layer containing the ground truth label. diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index b0524a507b..c9e3ded65c 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -76,6 +76,7 @@ __all__ = [ 'trans_layer', 'rotate_layer', 'sum_to_one_norm_layer', + 'row_l2_norm_layer', 'get_output_layer', 'LayerType', 'context_projection', @@ -126,6 +127,13 @@ __all__ = [ 'row_conv_layer', 'dropout_layer', 'prelu_layer', + 'gated_unit_layer', + 'crop_layer', + 'sub_nested_seq_layer', + 'clip_layer', + 'slice_projection', + 'kmax_sequence_score_layer', + 'scale_shift_layer', ] @@ -157,6 +165,7 @@ class LayerType(object): BATCH_NORM_LAYER = 'batch_norm' NORM_LAYER = 'norm' SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm' + ROW_L2_NORM_LAYER = 'row_l2_norm' ADDTO_LAYER = 'addto' CONCAT_LAYER = 'concat' @@ -217,6 +226,12 @@ class LayerType(object): SMOOTH_L1 = 'smooth_l1' PRELU = 'prelu' + CROP_LAYER = 'crop' + SUB_NESTED_SEQ = 'sub_nested_seq' + CLIP_LAYER = 'clip' + + KMAX_SEQ_SCORE = 'kmax_seq_score' + SCALE_SHIFT_LAYER = 'scale_shift' @staticmethod def is_layer_type(type_name): @@ -533,6 +548,45 @@ def identity_projection(input, offset=None, size=None): return proj +def slice_projection(input, slices): + """ + slice_projection can slice the input value into multiple parts, + and then select some of them to merge into a new output. + + .. math:: + output = [input.slices()] + + The example usage is: + + .. code-block:: python + + proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)]) + + Note that slice_projection should not have any parameter. + + :param input: Input Layer. + :type input: LayerOutput + :param slices: An array of slice parameters. + Each slice contains the start and end offsets based + on the input. + :type slices: pair of int + :return: A SliceProjection object + :rtype: SliceProjection + """ + assert len(slices) >= 1 + start = 0 + for i in xrange(len(slices)): + assert len(slices[i]) == 2 + # The start position of the next slice needs to be greater than + # or equal to the end position of the previous slice. 
+ assert slices[i][0] >= start + assert slices[i][1] >= slices[i][0] + start = slices[i][1] + proj = SliceProjection(input_layer_name=input.name, slices=slices) + proj.origin = input + return proj + + @wrap_param_attr_default() def scaling_projection(input, param_attr=None): """ @@ -862,7 +916,7 @@ def data_layer(name, size, height=None, width=None, layer_attr=None): @wrap_name_default("embedding") @wrap_param_attr_default() -@layer_support(ERROR_CLIPPING) +@layer_support(ERROR_CLIPPING, DROPOUT) def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): """ Define a embedding Layer. @@ -1317,7 +1371,7 @@ def pooling_layer(input, @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation()) @wrap_name_default("lstmemory") -@layer_support(DROPOUT) +@layer_support() def lstmemory(input, name=None, size=None, @@ -1426,7 +1480,7 @@ def lstmemory(input, @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act"], act=TanhActivation()) @wrap_name_default("gru") -@layer_support(DROPOUT) +@layer_support() def grumemory(input, size=None, name=None, @@ -1790,7 +1844,7 @@ def repeat_layer(input, @wrap_name_default("seqreshape") @wrap_act_default(act=IdentityActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support() +@layer_support(ERROR_CLIPPING, DROPOUT) def seq_reshape_layer(input, reshape_size, act=None, @@ -2700,7 +2754,7 @@ def img_cmrnorm_layer(input, default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.)) @wrap_act_default(act=ReluActivation()) @wrap_name_default("batch_norm") -@layer_support(DROPOUT) +@layer_support(DROPOUT, ERROR_CLIPPING) def batch_norm_layer(input, act=None, name=None, @@ -2780,15 +2834,6 @@ def batch_norm_layer(input, :return: LayerOutput object. :rtype: LayerOutput """ - if not isinstance(act, ReluActivation): - logger.log(logging.WARN, - "%s is not recommend for batch normalization's activation, " - "maybe the relu is better" % act.name) - - if not isinstance(input.activation, LinearActivation): - logger.log(logging.WARN, - "The activation should be inside batch normalization, the " - "previous layer's activation may be Linear") if num_channels is None: if input.num_filters is not None: @@ -2855,10 +2900,46 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], size=input.size) +@wrap_name_default() +@layer_support() +def row_l2_norm_layer(input, name=None, layer_attr=None): + """ + A layer for L2-normalization in each row. + + .. math:: + out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}} + + where the size of :math:`in` is (batchSize x dataDim) , + and the size of :math:`out` is a (batchSize x dataDim) . + + The example usage is: + + .. code-block:: python + + row_l2_norm_layer = row_l2_norm_layer(input=layer) + + :param input: Input layer. + :type input: LayerOutput + :param name: Layer name. + :type name: basestring + :param layer_attr: extra layer attributes. + :type layer_attr: ExtraLayerAttribute. + :return: LayerOutput object. 
+ :rtype: LayerOutput + """ + Layer( + name=name, + type=LayerType.ROW_L2_NORM_LAYER, + inputs=[input.name], + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.ROW_L2_NORM_LAYER, parents=[input], size=input.size) + + @wrap_name_default("addto") @wrap_act_default(act=LinearActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support(DROPOUT) +@layer_support(DROPOUT, ERROR_CLIPPING) def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): """ AddtoLayer. @@ -2937,7 +3018,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): @wrap_act_default(act=IdentityActivation()) @wrap_name_default("concat") -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): """ Concat all input vector into one huge vector. @@ -3021,7 +3102,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): @wrap_name_default("seqconcat") @wrap_act_default(act=IdentityActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, bias_attr=None): """ @@ -3170,8 +3251,8 @@ def memory(name, @wrap_bias_attr_default() -@wrap_act_default( - param_names=['gate_act', 'state_act'], act=SigmoidActivation()) +@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) +@wrap_act_default(param_names=['state_act'], act=TanhActivation()) @wrap_act_default(act=TanhActivation()) @wrap_name_default('lstm_step') @layer_support() @@ -3528,12 +3609,7 @@ def SubsequenceInput(input): @wrap_name_default("recurrent_group") -def recurrent_group(step, - input, - reverse=False, - name=None, - targetInlink=None, - is_generating=False): +def recurrent_group(step, input, reverse=False, name=None, targetInlink=None): """ Recurrent layer group is an extremely flexible recurrent unit in PaddlePaddle. As long as the user defines the calculation done within a @@ -3599,21 +3675,12 @@ def recurrent_group(step, :type targetInlink: LayerOutput|SubsequenceInput - :param is_generating: If is generating, none of input type should be LayerOutput; - else, for training or testing, one of the input type must - be LayerOutput. - - :type is_generating: bool - :return: LayerOutput object. 
:rtype: LayerOutput """ model_type('recurrent_nn') - def is_single_input(x): - return isinstance(x, LayerOutput) or isinstance(x, StaticInput) - - if is_single_input(input): + if isinstance(input, LayerOutput) or isinstance(input, StaticInput): input = [input] assert isinstance(input, collections.Sequence) @@ -3627,13 +3694,8 @@ def recurrent_group(step, in_links=map(lambda x: x.name, in_links), seq_reversed=reverse) in_args = [] - has_LayerOutput = False for each_input in input: - assert is_single_input(each_input) - if isinstance(each_input, LayerOutput): - in_args.append(each_input) - has_LayerOutput = True - else: # StaticInput + if isinstance(each_input, StaticInput): # StaticInput mem_name = "__%s_memory__" % each_input.input.name mem = memory( name=None, @@ -3641,24 +3703,26 @@ def recurrent_group(step, boot_layer=each_input.input) mem.set_input(mem) in_args.append(mem) - - assert (is_generating != has_LayerOutput) + else: + in_args.append(each_input) layer_outs = step(*in_args) if isinstance(layer_outs, LayerOutput): layer_outs = [layer_outs] - for ot in layer_outs: - assert isinstance(ot, LayerOutput) - ot.reverse = reverse - RecurrentLayerGroupSetOutLink(ot.name) + for layer_out in layer_outs: + assert isinstance( + layer_out, LayerOutput + ), "Type of step function's return value must be LayerOutput." + layer_out.reverse = reverse + RecurrentLayerGroupSetOutLink(layer_out.name) RecurrentLayerGroupEnd(name=name) for layer_out in layer_outs: - # Thee previous full_name is the name is the rnn group - # We need a full_name outside the rnn group + # The previous full_name is the name inside the recurrent group. + # We need a full_name outside the recurrent group. layer_out.full_name = MakeLayerNameInSubmodel(layer_out.name) if len(layer_outs) == 1: @@ -3681,7 +3745,20 @@ class BaseGeneratedInput(object): class GeneratedInput(BaseGeneratedInput): def after_real_step(self, input): - return maxid_layer(input=input, name='__beam_search_predict__') + if isinstance(input, LayerOutput): + input = [input] + elif isinstance(input, collections.Sequence): + input = list(input) + if len(input) > 1: + logger.info( + ("More than one layers inside the recurrent_group " + "are returned as outputs of the entire recurrent_group " + "PLEASE garantee the first output is probability of " + "the predicted next word.")) + + return [maxid_layer( + input=input[0], name='__beam_search_predict__')] + ( + input[1:] if len(input) > 1 else []) def before_real_step(self): predict_id = memory( @@ -3868,6 +3945,7 @@ def beam_search(step, :type step: callable :param input: Input data for the recurrent unit, which should include the previously generated words as a GeneratedInput object. + In beam_search, none of the input's type should be LayerOutput. :type input: list :param bos_id: Index of the start symbol in the dictionary. 
The start symbol is a special token for NLP task, which indicates the @@ -3909,15 +3987,18 @@ def beam_search(step, real_input = [] for i, each_input in enumerate(input): - assert isinstance(each_input, StaticInput) or isinstance( - each_input, BaseGeneratedInput) + assert not isinstance(each_input, LayerOutput), ( + "in beam_search, " + "none of the input should has a type of LayerOutput.") if isinstance(each_input, BaseGeneratedInput): - assert generated_input_index == -1 + assert generated_input_index == -1, ("recurrent_group accepts " + "only one GeneratedInput.") generated_input_index = i + else: real_input.append(each_input) - assert generated_input_index != -1 + assert generated_input_index != -1, "No GeneratedInput is given." gipt = input[generated_input_index] @@ -3938,17 +4019,11 @@ def beam_search(step, predict = gipt.after_real_step(step(*args)) - eos_layer(input=predict, eos_id=eos_id, name=eos_name) + eos_layer(input=predict[0], eos_id=eos_id, name=eos_name) return predict - tmp = recurrent_group( - step=__real_step__, - input=real_input, - reverse=False, - name=name, - is_generating=True) - - return tmp + return recurrent_group( + step=__real_step__, input=real_input, reverse=False, name=name) def __cost_input__(input, label, weight=None): @@ -4483,7 +4558,7 @@ def tensor_layer(a, @wrap_param_attr_default() @wrap_bias_attr_default() @wrap_act_default() -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def selective_fc_layer(input, size, select=None, @@ -5862,7 +5937,7 @@ def prelu_layer(input, :rtype: LayerOutput """ - assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input' + assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.' assert isinstance(param_attr, ParameterAttribute) l = Layer( @@ -5876,3 +5951,304 @@ def prelu_layer(input, layer_type=LayerType.PRELU, parents=input, size=l.config.size) + + +@wrap_name_default() +@layer_support(ERROR_CLIPPING, DROPOUT) +@wrap_act_default(act=LinearActivation()) +def gated_unit_layer(input, + size, + act=None, + name=None, + gate_attr=None, + gate_param_attr=None, + gate_bias_attr=True, + inproj_attr=None, + inproj_param_attr=None, + inproj_bias_attr=True, + layer_attr=None): + """ + The gated unit layer implements a simple gating mechanism over the input. + The input :math:`X` is first projected into a new space :math:`X'`, and + it is also used to produce a gate weight :math:`\sigma`. Element-wise + prodict between :match:`X'` and :math:`\sigma` is finally returned. + + Reference: + Language Modeling with Gated Convolutional Networks + https://arxiv.org/abs/1612.08083 + + .. math:: + y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c) + + The example usage is: + + .. code-block:: python + gated_unit = gated_unit_layer(size=128, input=input_layer)) + + :param input: input for this layer. + :type input: LayerOutput + :param size: output size of the gated unit. + :type size: int + :param act: activation type of the projected input. + :type act: BaseActivation + :param name: name of this layer. + :type name: basestring + :param gate_attr: Attributes to tune the gate output, for example, error + clipping threshold, dropout and so on. See ExtraLayerAttribute for + more details. + :type gate_attr: ExtraLayerAttribute|None + :param gate_param_attr: Attributes to tune the learnable projected matrix + parameter of the gate. + :type gate_param_attr: ParameterAttribute|None + :param gate_bias_attr: Attributes to tune the learnable bias of the gate. 
+ :type gate_bias_attr: ParameterAttribute|None + :param inproj_attr: Attributes to the tune the projected input, for + example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type inproj_attr: ExtraLayerAttribute|None + :param inproj_param_attr: Attributes to tune the learnable parameter of + the projection of input. + :type inproj_param_attr: ParameterAttribute|None + :param inproj_bias_attr: Attributes to tune the learnable bias of + projection of the input. + :type inproj_bias_attr: ParameterAttribute|None + :param layer_attr: Attributes to tune the final output of the gated unit, + for example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + """ + + assert isinstance( + input, LayerOutput), 'The gated linear unit accepts only one input.' + + input_proj = fc_layer( + input=input, + name="%s_input_proj" % name, + size=size, + act=act, + layer_attr=inproj_attr, + param_attr=inproj_param_attr, + bias_attr=inproj_bias_attr) + + gate = fc_layer( + size=size, + name="%s_gate" % name, + act=SigmoidActivation(), + input=input, + layer_attr=gate_attr, + param_attr=gate_param_attr, + bias_attr=gate_bias_attr) + return mixed_layer( + name="%s_gated_act" % name, + input=dotmul_operator(input_proj, gate), + layer_attr=layer_attr) + + +@wrap_name_default() +@layer_support() +def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): + """ + The crop layer crops images by offset and shape. User can set crop shape by + args 'shape' explicitly or by reference input layer. + + The example usage is: + + .. code-block:: python + crop = crop_layer(input=[image_input, reference_input], axis=2, offset=[2, 3]) + + :param input: The input layer.If two inputs were setted, + the second input will be regarded as reference input + :type input: LayerOutput or Sequence + :param offset: The crop offset + :type offset: Sequence + :param axis: start axis to be cropped. To image input layer: + - 0: batch size + - 1: channels + - 2: height + - 3: width + :type partial_sum: int + :param shape: The shape to be cropped. Default is None. + :type shape: Sequence | None + :param name: Name of this layer. + :type name: basestring + :return: LayerOutput object. + :rtype: LayerOutput + """ + if isinstance(input, LayerOutput): + input = [input] + else: + assert isinstance(input, collections.Sequence) + l = Layer( + inputs=[x.name for x in input], + axis=axis, + offset=offset, + shape=shape, + name=name, + type=LayerType.CROP_LAYER, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + layer_type=LayerType.CROP_LAYER, + parents=input, + size=l.config.size) + + +@wrap_name_default() +@layer_support() +def sub_nested_seq_layer(input, selected_indices, name=None): + """ + The sub_nested_seq_layer accepts two inputs: the first one is a nested + sequence; the second one is a set of selceted indices in the nested sequence. + + Then sub_nest_seq_layer trims the first nested sequence input according + to the selected indices to form a new output. This layer is useful in + beam training. + + The example usage is: + + .. code-block:: python + + sub_nest_seq = sub_nested_seq_layer(input=[data, selected_indices]) + + + :param input: A nested sequence. + :type input: LayerOutput + :param selected_indices: a set of sequence indices in the nested sequence. 
+ :type input: LayerOutput + :param name: name of this layer. + :type name: basestring + :return: LayerOutput object. + :rtype: LayerOutput + """ + + assert isinstance(input, LayerOutput), ( + 'The first input of ' + 'sub_nested_seq_layer must be a Paddle layer.') + assert isinstance(selected_indices, LayerOutput), ( + 'The second input of ' + 'sub_nested_seq_layer must be a Paddle layer.') + + l = Layer( + inputs=input.name, + selected_indices=selected_indices.name, + name=name, + type=LayerType.SUB_NESTED_SEQ) + return LayerOutput( + name=name, + layer_type=LayerType.SUB_NESTED_SEQ, + parents=input, + size=l.config.size) + + +@wrap_name_default("clip") +def clip_layer(input, min, max, name=None): + """ + A layer for clipping the input value by the threshold. + + .. math:: + + out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right) + + .. code-block:: python + + clip = clip_layer(input=input_layer, min=-10, max=10) + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput. + :param min: The lower threshold for clipping. + :type min: double + :param max: The upper threshold for clipping. + :type max: double + :return: LayerOutput object. + :rtype: LayerOutput + """ + Layer( + name=name, + type=LayerType.CLIP_LAYER, + inputs=[input.name], + min=min, + max=max) + return LayerOutput( + name, LayerType.CLIP_LAYER, parents=[input], size=input.size) + + +@wrap_name_default() +@layer_support() +def kmax_sequence_score_layer(input, name=None, beam_size=1): + """ + This layer accepts one input which are scores over a sequence or a nested + sequence, and returns indices of beam_size sequences with highest scores. + + .. code-block:: python + + kmax_indices = kmax_sequence_score_layer(input=input_layer, beam_size) + + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. It stores scores over a sequence or a nested + sequence and its size must be 1. + :type input: LayerOutput. + :param beam_size: squence indices with top beam_size scores are returned. + :type beam_size: double + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert isinstance(input, LayerOutput), ("kmax_sequence_score_layer " + "accepts only one input.") + assert input.size == 1, ( + "input of kmax_sequence_score_layer is a score" + "over a sequence or a nested sequence, so its width must be 1.") + + Layer( + name=name, + type=LayerType.KMAX_SEQ_SCORE, + inputs=[input.name], + beam_size=beam_size) + + return LayerOutput( + name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size) + + +@wrap_name_default("scale_shift") +@wrap_param_attr_default() +@wrap_bias_attr_default() +def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None): + """ + A layer applies a linear transformation to each element in each row of + the input matrix. For each element, the layer first re-scale it and then + adds a bias to it. + + This layer is very like the SlopeInterceptLayer, except the scale and + bias are trainable. + + .. math:: + + y = w * x + b + + .. code-block:: python + + scale_shift = scale_shift_layer(input=input_layer, bias_attr=False) + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput. + :param param_attr: The parameter attribute of scaling. + :type param_attr: ParameterAttribute + :param bias_attr: The parameter attribute of shifting. + :type bias_attr: ParameterAttribute + :return: LayerOutput object. 
+ :rtype: LayerOutput + """ + Layer( + name=name, + type=LayerType.SCALE_SHIFT_LAYER, + inputs=Input(input.name, **param_attr.attr), + bias=ParamAttr.to_bias(bias_attr)) + return LayerOutput( + name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 810bea913e..34be203ee2 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -340,24 +340,40 @@ def img_conv_group(input, conv_with_batchnorm=False, conv_batchnorm_drop_rate=0, pool_stride=1, - pool_type=None): + pool_type=None, + param_attr=None): """ Image Convolution Group, Used for vgg net. - TODO(yuyang18): Complete docs - - :param conv_batchnorm_drop_rate: - :param input: - :param conv_num_filter: - :param pool_size: - :param num_channels: - :param conv_padding: - :param conv_filter_size: - :param conv_act: - :param conv_with_batchnorm: - :param pool_stride: - :param pool_type: - :return: + :param conv_batchnorm_drop_rate: if conv_with_batchnorm[i] is true, + conv_batchnorm_drop_rate[i] represents the drop rate of each batch norm. + :type conv_batchnorm_drop_rate: list + :param input: layer's input. + :type input: LayerOutput + :param conv_num_filter: output channels num. + :type conv_num_filter: int + :param pool_size: pooling filter size. + :type pool_size: int + :param num_channels: input channels num. + :type num_channels: int + :param conv_padding: convolution padding size. + :type conv_padding: int + :param conv_filter_size: convolution filter size. + :type conv_filter_size: int + :param conv_act: activation funciton after convolution. + :type conv_act: BaseActivation + :param conv_with_batchnorm: conv_with_batchnorm[i] represents + if there is a batch normalization after each convolution. + :type conv_with_batchnorm: list + :param pool_stride: pooling stride size. + :type pool_stride: int + :param pool_type: pooling type. + :type pool_type: BasePoolingType + :param param_attr: Convolution param attribute. + None means default attribute. + :type param_attr: ParameterAttribute + :return: Layer's output + :type: LayerOutput """ tmp = input @@ -397,6 +413,7 @@ def img_conv_group(input, padding=conv_padding[i], filter_size=conv_filter_size[i], num_filters=conv_num_filter[i], + param_attr=param_attr, **extra_kwargs) # logger.debug("tmp.num_filters = %d" % tmp.num_filters) @@ -614,18 +631,17 @@ def simple_lstm(input, @wrap_name_default('lstm_unit') def lstmemory_unit(input, - memory_boot=None, + out_memory=None, name=None, size=None, param_attr=None, act=None, gate_act=None, state_act=None, - mixed_bias_attr=None, + input_proj_bias_attr=None, + input_proj_layer_attr=None, lstm_bias_attr=None, - mixed_layer_attr=None, - lstm_layer_attr=None, - get_output_layer_attr=None): + lstm_layer_attr=None): """ Define calculations that a LSTM unit performs during a single time step. This function itself is not a recurrent layer, so it can not be @@ -662,8 +678,8 @@ def lstmemory_unit(input, :param input: input layer name. :type input: LayerOutput - :param memory_boot: the initialization state of the LSTM cell. - :type memory_boot: LayerOutput | None + :param out_memory: output of previous time step + :type out_memory: LayerOutput | None :param name: lstmemory unit name. :type name: basestring :param size: lstmemory unit size. @@ -676,33 +692,35 @@ def lstmemory_unit(input, :type gate_act: BaseActivation :param state_act: lstm state activiation type. 
:type state_act: BaseActivation - :param mixed_bias_attr: bias parameter attribute of mixed layer. - False means no bias, None means default bias. - :type mixed_bias_attr: ParameterAttribute|False + :param input_proj_bias_attr: bias attribute for input-to-hidden projection. + False means no bias, None means default bias. + :type input_proj_bias_attr: ParameterAttribute|False|None + :param input_proj_layer_attr: extra layer attribute for input to hidden + projection of the LSTM unit, such as dropout, error clipping. + :type input_proj_layer_attr: ExtraLayerAttribute :param lstm_bias_attr: bias parameter attribute of lstm layer. - False means no bias, None means default bias. + False means no bias, None means default bias. :type lstm_bias_attr: ParameterAttribute|False - :param mixed_layer_attr: mixed layer's extra attribute. - :type mixed_layer_attr: ExtraLayerAttribute :param lstm_layer_attr: lstm layer's extra attribute. :type lstm_layer_attr: ExtraLayerAttribute - :param get_output_layer_attr: get output layer's extra attribute. - :type get_output_layer_attr: ExtraLayerAttribute :return: lstmemory unit name. :rtype: LayerOutput """ if size is None: assert input.size % 4 == 0 size = input.size / 4 - out_mem = memory(name=name, size=size) - state_mem = memory( - name="%s_state" % name, size=size, boot_layer=memory_boot) + if out_memory is None: + out_mem = memory(name=name, size=size) + else: + out_mem = out_memory + + state_mem = memory(name="%s_state" % name, size=size) with mixed_layer( name="%s_input_recurrent" % name, size=size * 4, - bias_attr=mixed_bias_attr, - layer_attr=mixed_layer_attr, + bias_attr=input_proj_bias_attr, + layer_attr=input_proj_layer_attr, act=IdentityActivation()) as m: m += identity_projection(input=input) m += full_matrix_projection(input=out_mem, param_attr=param_attr) @@ -717,11 +735,7 @@ def lstmemory_unit(input, gate_act=gate_act, state_act=state_act, layer_attr=lstm_layer_attr) - get_output_layer( - name='%s_state' % name, - input=lstm_out, - arg_name='state', - layer_attr=get_output_layer_attr) + get_output_layer(name='%s_state' % name, input=lstm_out, arg_name='state') return lstm_out @@ -730,17 +744,16 @@ def lstmemory_unit(input, def lstmemory_group(input, size=None, name=None, - memory_boot=None, + out_memory=None, reverse=False, param_attr=None, act=None, gate_act=None, state_act=None, - mixed_bias_attr=None, + input_proj_bias_attr=None, + input_proj_layer_attr=None, lstm_bias_attr=None, - mixed_layer_attr=None, - lstm_layer_attr=None, - get_output_layer_attr=None): + lstm_layer_attr=None): """ lstm_group is a recurrent_group version of Long Short Term Memory. It does exactly the same calculation as the lstmemory layer (see lstmemory in @@ -774,8 +787,8 @@ def lstmemory_group(input, :type size: int :param name: name of the lstmemory group. :type name: basestring - :param memory_boot: the initialization state of LSTM cell. - :type memory_boot: LayerOutput | None + :param out_memory: output of previous time step + :type out_memory: LayerOutput | None :param reverse: is lstm reversed :type reverse: bool :param param_attr: Parameter config, None if use default. @@ -786,18 +799,17 @@ def lstmemory_group(input, :type gate_act: BaseActivation :param state_act: lstm state activiation type. :type state_act: BaseActivation - :param mixed_bias_attr: bias parameter attribute of mixed layer. - False means no bias, None means default bias. - :type mixed_bias_attr: ParameterAttribute|False :param lstm_bias_attr: bias parameter attribute of lstm layer. 
False means no bias, None means default bias. :type lstm_bias_attr: ParameterAttribute|False - :param mixed_layer_attr: mixed layer's extra attribute. - :type mixed_layer_attr: ExtraLayerAttribute + :param input_proj_bias_attr: bias attribute for input-to-hidden projection. + False means no bias, None means default bias. + :type input_proj_bias_attr: ParameterAttribute|False|None + :param input_proj_layer_attr: extra layer attribute for input to hidden + projection of the LSTM unit, such as dropout, error clipping. + :type input_proj_layer_attr: ExtraLayerAttribute :param lstm_layer_attr: lstm layer's extra attribute. :type lstm_layer_attr: ExtraLayerAttribute - :param get_output_layer_attr: get output layer's extra attribute. - :type get_output_layer_attr: ExtraLayerAttribute :return: the lstmemory group. :rtype: LayerOutput """ @@ -805,18 +817,17 @@ def lstmemory_group(input, def __lstm_step__(ipt): return lstmemory_unit( input=ipt, - memory_boot=memory_boot, name=name, size=size, - mixed_bias_attr=mixed_bias_attr, - mixed_layer_attr=mixed_layer_attr, - param_attr=param_attr, - lstm_bias_attr=lstm_bias_attr, act=act, gate_act=gate_act, state_act=state_act, + out_memory=out_memory, + input_proj_bias_attr=input_proj_bias_attr, + input_proj_layer_attr=input_proj_layer_attr, + param_attr=param_attr, lstm_layer_attr=lstm_layer_attr, - get_output_layer_attr=get_output_layer_attr) + lstm_bias_attr=lstm_bias_attr) return recurrent_group( name='%s_recurrent_group' % name, diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt index 6c860fd497..580aef935b 100644 --- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt +++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt @@ -1,17 +1,17 @@ #################### test_config_parser ######################### add_test(NAME layers_test - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ - ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py - WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle) + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ + ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle) add_test(NAME test_reset_hook - COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/ - ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py - WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle) + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ + ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle) add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp) add_test(NAME test_layerHelpers COMMAND - ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE} + ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal ) diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 70e342fb79..3860699f6f 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -7,6 +7,7 @@ 
test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer -test_recursive_topology) +test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer +test_kmax_seq_socre_layer test_seq_select_layers test_scale_shift_layer) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr index 7f2aa5a0fe..75cf231203 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr @@ -104,7 +104,7 @@ layers { } bias_parameter_name: "lstm_bias" active_gate_type: "sigmoid" - active_state_type: "sigmoid" + active_state_type: "tanh" } layers { name: "__lstm_group_0___state@__lstm_group_0___recurrent_group" @@ -183,7 +183,7 @@ layers { } bias_parameter_name: "lstm_bias" active_gate_type: "sigmoid" - active_state_type: "sigmoid" + active_state_type: "tanh" } layers { name: "__lstm_group_1___state@__lstm_group_1___recurrent_group" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr new file mode 100644 index 0000000000..4b9578a0c0 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr @@ -0,0 +1,31 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "__clip_0__" + type: "clip" + size: 300 + active_type: "" + inputs { + input_layer_name: "input" + clip_conf { + min: -10 + max: 10 + } + } +} +input_layer_names: "input" +output_layer_names: "__clip_0__" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__clip_0__" + input_layer_names: "input" + output_layer_names: "__clip_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr new file mode 100644 index 0000000000..f1e4d894a5 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr @@ -0,0 +1,106 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 256 + active_type: "" +} +layers { + name: "__gated_unit_layer_0___input_proj" + type: "fc" + size: 512 + active_type: "tanh" + inputs { + input_layer_name: "input" + input_parameter_name: "___gated_unit_layer_0___input_proj.w0" + } + bias_parameter_name: "___gated_unit_layer_0___input_proj.wbias" + error_clipping_threshold: 100.0 +} +layers { + name: "__gated_unit_layer_0___gate" + type: "fc" + size: 512 + active_type: "sigmoid" + inputs { + input_layer_name: "input" + input_parameter_name: "___gated_unit_layer_0___gate.w0" + } + bias_parameter_name: "___gated_unit_layer_0___gate.wbias" + error_clipping_threshold: 100.0 +} +layers { + name: "__gated_unit_layer_0___gated_act" + type: "mixed" + size: 512 + active_type: "" + inputs { + input_layer_name: "__gated_unit_layer_0___input_proj" + } + inputs { + input_layer_name: "__gated_unit_layer_0___gate" + } + 
error_clipping_threshold: 100.0 + operator_confs { + type: "dot_mul" + input_indices: 0 + input_indices: 1 + input_sizes: 512 + input_sizes: 512 + output_size: 512 + dotmul_scale: 1 + } +} +parameters { + name: "___gated_unit_layer_0___input_proj.w0" + size: 131072 + initial_mean: 0.0 + initial_std: 0.0001 + dims: 256 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___input_proj.wbias" + size: 512 + initial_mean: 0.0 + initial_std: 1 + dims: 1 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___gate.w0" + size: 131072 + initial_mean: 0.0 + initial_std: 0.0001 + dims: 256 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___gate.wbias" + size: 512 + initial_mean: 0.0 + initial_std: 1 + dims: 1 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "input" +output_layer_names: "__gated_unit_layer_0___gated_act" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__gated_unit_layer_0___input_proj" + layer_names: "__gated_unit_layer_0___gate" + layer_names: "__gated_unit_layer_0___gated_act" + input_layer_names: "input" + output_layer_names: "__gated_unit_layer_0___gated_act" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr new file mode 100644 index 0000000000..81bd71f68e --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr @@ -0,0 +1,66 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "data" + type: "data" + size: 128 + active_type: "" +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 1 + active_type: "exponential" + inputs { + input_layer_name: "data" + input_parameter_name: "___fc_layer_0__.w0" + } + bias_parameter_name: "___fc_layer_0__.wbias" +} +layers { + name: "__kmax_sequence_score_layer_0__" + type: "kmax_seq_score" + active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + } + beam_size: 5 +} +parameters { + name: "___fc_layer_0__.w0" + size: 128 + initial_mean: 0.0 + initial_std: 0.0883883476483 + dims: 128 + dims: 1 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__.wbias" + size: 1 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +output_layer_names: "__kmax_sequence_score_layer_0__" +sub_models { + name: "root" + layer_names: "input" + layer_names: "data" + layer_names: "__fc_layer_0__" + layer_names: "__kmax_sequence_score_layer_0__" + input_layer_names: "data" + output_layer_names: "__kmax_sequence_score_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr index af1b63c5df..711785be37 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr @@ -258,7 +258,7 @@ layers { } bias_parameter_name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias" active_gate_type: "sigmoid" - active_state_type: "sigmoid" + 
active_state_type: "tanh" } layers { name: "__lstm_group_0___state@__lstm_group_0___recurrent_group" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr new file mode 100644 index 0000000000..c2786ff55c --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr @@ -0,0 +1,27 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "__row_l2_norm_layer_0__" + type: "row_l2_norm" + size: 300 + active_type: "" + inputs { + input_layer_name: "input" + } +} +input_layer_names: "input" +output_layer_names: "__row_l2_norm_layer_0__" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__row_l2_norm_layer_0__" + input_layer_names: "input" + output_layer_names: "__row_l2_norm_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr new file mode 100644 index 0000000000..35ade126a2 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr @@ -0,0 +1,72 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__scale_shift_0__" + type: "scale_shift" + size: 100 + active_type: "" + inputs { + input_layer_name: "data" + input_parameter_name: "___scale_shift_0__.w0" + } +} +layers { + name: "__scale_shift_1__" + type: "scale_shift" + size: 100 + active_type: "" + inputs { + input_layer_name: "data" + input_parameter_name: "___scale_shift_1__.w0" + } + bias_parameter_name: "___scale_shift_1__.wbias" +} +parameters { + name: "___scale_shift_0__.w0" + size: 1 + initial_mean: 0.0 + initial_std: 1.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___scale_shift_1__.w0" + size: 1 + initial_mean: 0.0 + initial_std: 1.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___scale_shift_1__.wbias" + size: 1 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +output_layer_names: "__scale_shift_0__" +output_layer_names: "__scale_shift_1__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__scale_shift_0__" + layer_names: "__scale_shift_1__" + input_layer_names: "data" + output_layer_names: "__scale_shift_0__" + output_layer_names: "__scale_shift_1__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr new file mode 100644 index 0000000000..4b906b113e --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr @@ -0,0 +1,37 @@ +type: "nn" +layers { + name: "input_seq" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "input" + type: "data" + size: 5 + active_type: "" +} +layers { + name: "__sub_nested_seq_layer_0__" + type: "sub_nested_seq" + size: 300 + active_type: "" + inputs { + input_layer_name: "input_seq" + } + inputs { + input_layer_name: "input" + } +} +input_layer_names: "input_seq" +output_layer_names: 
"__sub_nested_seq_layer_0__" +sub_models { + name: "root" + layer_names: "input_seq" + layer_names: "input" + layer_names: "__sub_nested_seq_layer_0__" + input_layer_names: "input_seq" + output_layer_names: "__sub_nested_seq_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py index 05810597b3..565e281a6e 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py +++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py @@ -20,12 +20,13 @@ lstm1 = lstmemory_group( input=m1, param_attr=lstm_param, lstm_bias_attr=lstm_bias, - mixed_bias_attr=False) + input_proj_bias_attr=False) + lstm2 = lstmemory_group( input=m2, param_attr=lstm_param, lstm_bias_attr=lstm_bias, - mixed_bias_attr=False) + input_proj_bias_attr=False) softmax_param = ParamAttr(name='softmax_param') diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py new file mode 100644 index 0000000000..f066fe1fb3 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py @@ -0,0 +1,6 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=300) +clip = clip_layer(input=data, min=-10, max=10) + +outputs(clip) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py new file mode 100644 index 0000000000..8314a7e9a5 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py @@ -0,0 +1,21 @@ +from paddle.trainer_config_helpers import * + +settings(batch_size=1000, learning_rate=1e-5) + +data = data_layer(name='data', size=2016, height=48, width=42) +refernce_data = data_layer(name='data', size=768, height=16, width=16) + +conv = img_conv_layer( + input=data, + filter_size=3, + num_channels=1, + num_filters=16, + padding=1, + act=LinearActivation(), + bias_attr=True) + +pool = img_pool_layer(input=conv, pool_size=2, stride=2, pool_type=MaxPooling()) + +crop = crop_layer(input=[pool, refernce_data], axis=2) + +outputs(pad) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py new file mode 100644 index 0000000000..9dab45519c --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py @@ -0,0 +1,16 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=256) +glu = gated_unit_layer( + size=512, + input=data, + act=TanhActivation(), + gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + gate_param_attr=ParamAttr(initial_std=1e-4), + gate_bias_attr=ParamAttr(initial_std=1), + inproj_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + inproj_param_attr=ParamAttr(initial_std=1e-4), + inproj_bias_attr=ParamAttr(initial_std=1), + layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0)) + +outputs(glu) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py new file mode 100644 index 0000000000..d245c5a41c --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +#coding=utf-8 +from 
paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=300) + +data = data_layer(name="data", size=128) +scores = fc_layer(input=data, size=1, act=ExpActivation()) +kmax_seq_id = kmax_sequence_score_layer(input=scores, beam_size=5) + +outputs(kmax_seq_id) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py new file mode 100644 index 0000000000..ac8badb26a --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py @@ -0,0 +1,6 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=300) +row_l2_norm = row_l2_norm_layer(input=data) + +outputs(row_l2_norm) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py new file mode 100644 index 0000000000..dd589116fa --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py @@ -0,0 +1,9 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='data', size=100) + +scale = scale_shift_layer(input=data, bias_attr=False) + +scale_shift = scale_shift_layer(input=data) + +outputs(scale, scale_shift) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py new file mode 100644 index 0000000000..6d1c3175ba --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +#coding=utf-8 +from paddle.trainer_config_helpers import * + +beam_size = 5 + +data = data_layer(name='input_seq', size=300) +selected_ids = data_layer(name='input', size=beam_size) +sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids) + +outputs(sub_nest_seq) diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 3ba5c31871..5bea980611 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -20,7 +20,6 @@ import trainer import event import data_type import topology -import data_feeder import networks import evaluator from . import dataset @@ -31,10 +30,11 @@ import op import pooling import inference import networks -import py_paddle.swig_paddle as api import minibatch import plot import image +import model +import paddle.trainer.config_parser as cp __all__ = [ 'optimizer', @@ -47,7 +47,6 @@ __all__ = [ 'data_type', 'attr', 'pooling', - 'data_feeder', 'dataset', 'reader', 'topology', @@ -57,10 +56,14 @@ __all__ = [ 'evaluator', 'image', 'master', + 'model', ] +cp.begin_parse() + def init(**kwargs): + import py_paddle.swig_paddle as api args = [] args_dict = {} # NOTE: append arguments if they are in ENV @@ -73,6 +76,11 @@ def init(**kwargs): for key in args_dict.keys(): args.append('--%s=%s' % (key, str(args_dict[key]))) + if 'use_gpu' in kwargs: + cp.g_command_config_args['use_gpu'] = kwargs['use_gpu'] + assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not " + "supported in v2 APIs.") + api.initPaddle(*args) diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index 2698251b9e..98dfb85a0e 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - from py_paddle import DataProviderConverter import collections import paddle.trainer.PyDataProvider2 as pydp2 diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index 2e4beb6882..90830515c1 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -26,8 +26,9 @@ import sentiment import wmt14 import mq2007 import flowers +import voc2012 __all__ = [ 'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment' - 'uci_housing', 'wmt14', 'mq2007', 'flowers' + 'uci_housing', 'wmt14', 'mq2007', 'flowers', 'voc2012' ] diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index f885b2834e..0a2a1ced11 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -133,7 +133,7 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train100(), 10, "cifar_train100") - paddle.v2.dataset.common.convert(path, test100(), 10, "cifar_test100") - paddle.v2.dataset.common.convert(path, train10(), 10, "cifar_train10") - paddle.v2.dataset.common.convert(path, test10(), 10, "cifar_test10") + paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100") + paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100") + paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10") + paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10") diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 4a2eb59c34..053ae151c5 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -22,6 +22,8 @@ import importlib import paddle.v2.dataset import cPickle import glob +import cPickle as pickle +import random __all__ = [ 'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader', @@ -30,17 +32,22 @@ __all__ = [ DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset') + # When running unit tests, there could be multiple processes that # trying to create DATA_HOME directory simultaneously, so we cannot # use a if condition to check for the existence of the directory; # instead, we use the filesystem as the synchronization mechanism by # catching returned errors. 
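The refactor just below folds this EEXIST-tolerant directory creation into a reusable must_mkdirs() helper. As a minimal, self-contained sketch of the pattern the comment describes (illustrative only, not part of the patch; the helper name and path here are hypothetical):

    import errno
    import os

    def make_dir_racefree(path):
        # Several test processes may race to create the same cache directory.
        # Treat "already exists" as success; re-raise any other OS error.
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    make_dir_racefree('/tmp/paddle_dataset_cache')  # hypothetical cache path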
-try: - os.makedirs(DATA_HOME) -except OSError as exc: - if exc.errno != errno.EEXIST: - raise - pass +def must_mkdirs(path): + try: + os.makedirs(DATA_HOME) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + pass + + +must_mkdirs(DATA_HOME) def md5file(fname): @@ -91,6 +98,19 @@ def fetch_all(): "fetch")() +def fetch_all_recordio(path): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "convert" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \ + not module_name == "common": + ds_path = os.path.join(path, module_name) + must_mkdirs(ds_path) + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "convert")(ds_path) + + def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): """ you can call the function as: @@ -164,55 +184,37 @@ def cluster_files_reader(files_pattern, return reader -def convert(output_path, - reader, - num_shards, - name_prefix, - max_lines_to_shuffle=1000): +def convert(output_path, reader, line_count, name_prefix): import recordio - import cPickle as pickle - import random """ Convert data from reader to recordio format files. :param output_path: directory in which output files will be saved. :param reader: a data reader, from which the convert program will read data instances. - :param num_shards: the number of shards that the dataset will be partitioned into. :param name_prefix: the name prefix of generated files. :param max_lines_to_shuffle: the max lines numbers to shuffle before writing. """ - assert num_shards >= 1 - assert max_lines_to_shuffle >= 1 - - def open_writers(): - w = [] - for i in range(0, num_shards): - n = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, i, - num_shards - 1) - w.append(recordio.writer(n)) - - return w - - def close_writers(w): - for i in range(0, num_shards): - w[i].close() + assert line_count >= 1 + indx_f = 0 - def write_data(w, lines): + def write_data(indx_f, lines): random.shuffle(lines) - for i, d in enumerate(lines): - d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL) - w[i % num_shards].write(d) + filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f) + writer = recordio.writer(filename) + for l in lines: + # FIXME(Yancey1989): + # dumps with protocol: pickle.HIGHEST_PROTOCOL + writer.write(cPickle.dumps(l)) + writer.close() - w = open_writers() lines = [] - for i, d in enumerate(reader()): lines.append(d) - if i % max_lines_to_shuffle == 0 and i >= max_lines_to_shuffle: - write_data(w, lines) + if i % line_count == 0 and i >= line_count: + write_data(indx_f, lines) lines = [] + indx_f += 1 continue - write_data(w, lines) - close_writers(w) + write_data(indx_f, lines) diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index f8aae52e7c..23f5a24a1c 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -233,5 +233,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, test(), 10, "conl105_train") - paddle.v2.dataset.common.convert(path, test(), 10, "conl105_test") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py index e2a21e6e3e..634388094c 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/v2/dataset/flowers.py @@ -116,7 +116,7 @@ def reader_creator(data_file, data = 
batch['data'] labels = batch['label'] for sample, label in itertools.izip(data, batch['label']): - yield sample, int(label) + yield sample, int(label) - 1 if use_xmap: return xmap_readers(mapper, reader, cpu_count(), buffered_size) diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index c0ec5992e0..93dd3e8f7d 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -173,5 +173,5 @@ def convert(path): Converts dataset to recordio format """ w = word_dict() - paddle.v2.dataset.common.convert(path, lambda: train(w), 10, "imdb_train") - paddle.v2.dataset.common.convert(path, lambda: test(w), 10, "imdb_test") + paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") + paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index b18ee8e9ba..617c722c41 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -155,6 +155,7 @@ def convert(path): N = 5 word_dict = build_dict() paddle.v2.dataset.common.convert(path, - train(word_dict, N), 10, "imikolov_train") + train(word_dict, N), 1000, + "imikolov_train") paddle.v2.dataset.common.convert(path, - test(word_dict, N), 10, "imikolov_test") + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index ea5891f4f3..9f675bed89 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -119,5 +119,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "minist_train") - paddle.v2.dataset.common.convert(path, test(), 10, "minist_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index d9372d422a..5b61a9420a 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -254,8 +254,8 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "movielens_train") - paddle.v2.dataset.common.convert(path, test(), 10, "movielens_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py index fd71b34166..b705c9109b 100644 --- a/python/paddle/v2/dataset/mq2007.py +++ b/python/paddle/v2/dataset/mq2007.py @@ -212,19 +212,19 @@ def gen_pair(querylist, partial_order="full"): for j in range(i + 1, len(querylist)): query_right = querylist[j] if query_left.relevance_score > query_right.relevance_score: - labels.append(1) + labels.append([1]) docpairs.append([ np.array(query_left.feature_vector), np.array(query_right.feature_vector) ]) elif query_left.relevance_score < query_right.relevance_score: - labels.append(1) + labels.append([1]) docpairs.append([ np.array(query_right.feature_vector), np.array(query_left.feature_vector) ]) for label, pair in zip(labels, docpairs): - yield label, pair[0], pair[1] + yield np.array(label), pair[0], pair[1] def gen_list(querylist): @@ -242,9 +242,9 @@ def gen_list(querylist): if not isinstance(querylist, QueryList): querylist = QueryList(querylist) 
querylist._correct_ranking_() - relevance_score_list = [query.relevance_score for query in querylist] + relevance_score_list = [[query.relevance_score] for query in querylist] feature_vector_list = [query.feature_vector for query in querylist] - yield np.array(relevance_score_list).T, np.array(feature_vector_list) + yield np.array(relevance_score_list), np.array(feature_vector_list) def query_filter(querylists): diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index e33f120c87..b0b9757c1a 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -137,5 +137,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train, 10, "sentiment_train") - paddle.v2.dataset.common.convert(path, test, 10, "sentiment_test") + paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py new file mode 100644 index 0000000000..31e72ebf5e --- /dev/null +++ b/python/paddle/v2/dataset/tests/voc2012_test.py @@ -0,0 +1,42 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.voc2012 +import unittest + + +class TestVOC(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + for l in reader(): + self.assertEqual(l[0].size, 3 * l[1].size) + sum += 1 + return sum + + def test_train(self): + count = self.check_reader(paddle.v2.dataset.voc_seg.train()) + self.assertEqual(count, 2913) + + def test_test(self): + count = self.check_reader(paddle.v2.dataset.voc_seg.test()) + self.assertEqual(count, 1464) + + def test_val(self): + count = self.check_reader(paddle.v2.dataset.voc_seg.val()) + self.assertEqual(count, 1449) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index ec10ce646e..ce60aa21c2 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -119,5 +119,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train") - paddle.v2.dataset.common.convert(path, test(), 10, "uci_houseing_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py new file mode 100644 index 0000000000..617e212d67 --- /dev/null +++ b/python/paddle/v2/dataset/voc2012.py @@ -0,0 +1,85 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image dataset for segmentation. +The 2012 dataset contains images from 2008-2011 for which additional +segmentations have been prepared. As in previous years the assignment +to training/test sets has been maintained. The total number of images +with segmentation has been increased from 7,062 to 9,993. +""" + +import tarfile +import io +import numpy as np +from paddle.v2.dataset.common import download +from paddle.v2.image import * +from PIL import Image + +__all__ = ['train', 'test', 'val'] + +VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\ +VOCtrainval_11-May-2012.tar' + +VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' +SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' +DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' +LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png' + +CACHE_DIR = 'voc2012' + + +def reader_creator(filename, sub_name): + + tarobject = tarfile.open(filename) + name2mem = {} + for ele in tarobject.getmembers(): + name2mem[ele.name] = ele + + def reader(): + set_file = SET_FILE.format(sub_name) + sets = tarobject.extractfile(name2mem[set_file]) + for line in sets: + line = line.strip() + data_file = DATA_FILE.format(line) + label_file = LABEL_FILE.format(line) + data = tarobject.extractfile(name2mem[data_file]).read() + label = tarobject.extractfile(name2mem[label_file]).read() + data = Image.open(io.BytesIO(data)) + label = Image.open(io.BytesIO(label)) + data = np.array(data) + label = np.array(label) + yield data, label + + return reader + + +def train(): + """ + Create a train dataset reader containing 2913 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval') + + +def test(): + """ + Create a test dataset reader containing 1464 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train') + + +def val(): + """ + Create a val dataset reader containing 1449 images in HWC order. 
+ """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val') diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 2a631c365f..95a35d97ce 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -169,5 +169,6 @@ def convert(path): Converts dataset to recordio format """ dict_size = 30000 - paddle.v2.dataset.common.convert(path, train(dict_size), 10, "wmt14_train") - paddle.v2.dataset.common.convert(path, test(dict_size), 10, "wmt14_test") + paddle.v2.dataset.common.convert(path, + train(dict_size), 1000, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index fd6050fa33..7589cc9917 100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -9,8 +9,6 @@ There are: * BeginPass * EndPass """ -import py_paddle.swig_paddle as api - __all__ = [ 'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult' ] @@ -18,6 +16,7 @@ __all__ = [ class WithMetric(object): def __init__(self, evaluator): + import py_paddle.swig_paddle as api if not isinstance(evaluator, api.Evaluator): raise TypeError("Evaluator should be api.Evaluator type") self.__evaluator__ = evaluator diff --git a/python/paddle/v2/framework/.gitignore b/python/paddle/v2/framework/.gitignore new file mode 100644 index 0000000000..2ff540d576 --- /dev/null +++ b/python/paddle/v2/framework/.gitignore @@ -0,0 +1 @@ +proto diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/framework/default_scope_funcs.py new file mode 100644 index 0000000000..1b5580c8b3 --- /dev/null +++ b/python/paddle/v2/framework/default_scope_funcs.py @@ -0,0 +1,84 @@ +""" +Default scope function. + +`Paddle` manages Scope as programming language's scope. It just a +thread-local stack of Scope. Top of that stack is current scope, the bottom +of that stack is all scopes' parent. + +Invoking `new_var/find_var` can `new/find` variable in current scope. +Invoking `enter_local_scope/leave_local_scope` can create or destroy local +scope. + +A `scoped_function` will take a `function` as input. That function will be +invoked in a new local scope. +""" + +import paddle.v2.framework.core +import threading + +__tl_scope__ = threading.local() + +__all__ = [ + 'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'new_var', + 'find_var', 'scoped_function' +] + + +def get_cur_scope(): + """ + Get current scope. + :rtype: paddle.v2.framework.core.Scope + """ + cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None) + if cur_scope_stack is None: + __tl_scope__.cur_scope = list() + if len(__tl_scope__.cur_scope) == 0: + __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope()) + return __tl_scope__.cur_scope[-1] + + +def enter_local_scope(): + """ + Enter a new local scope + """ + cur_scope = get_cur_scope() + new_scope = cur_scope.new_scope() + __tl_scope__.cur_scope.append(new_scope) + + +def leave_local_scope(): + """ + Leave local scope + """ + __tl_scope__.cur_scope.pop() + get_cur_scope().drop_kids() + + +def new_var(name): + """ + create variable in current scope. + """ + return get_cur_scope().new_var(name) + + +def find_var(name): + """ + get variable in current scope. + """ + return get_cur_scope().find_var(name) + + +def scoped_function(func): + """ + invoke `func` in new scope. + + :param func: a callable function that will be run in new scope. 
+ :type func: callable + """ + enter_local_scope() + try: + func() + except: + raise + finally: + leave_local_scope() diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py new file mode 100644 index 0000000000..6ac656321e --- /dev/null +++ b/python/paddle/v2/framework/op.py @@ -0,0 +1,202 @@ +import paddle.v2.framework.core as core +import paddle.v2.framework.proto.framework_pb2 as framework_pb2 + + +def get_all_op_protos(): + """ + Get all registered op proto from Paddle C++ + :return: list of OpProto + """ + protostrs = core.get_all_op_protos() + ret_values = [] + for pbstr in protostrs: + op_proto = framework_pb2.OpProto.FromString(str(pbstr)) + ret_values.append(op_proto) + return ret_values + + +def is_str(s): + return isinstance(s, str) or isinstance(s, unicode) + + +class OpDescCreationMethod(object): + """ + A Functor object to convert user input(use key word args) to OpDesc based on + OpProto. + + :param op_proto: The OpProto object. + :type op_proto: op_proto_pb2.OpProto + """ + + def __init__(self, op_proto): + if not isinstance(op_proto, framework_pb2.OpProto): + raise TypeError("Argument should be OpProto") + self.__op_proto__ = op_proto + + def __call__(self, *args, **kwargs): + """ + Convert user input to OpDesc. Only key-word args are supported. + :return: OpDesc based on user input + :rtype: op_desc_pb2.OpDesc + """ + if len(args) != 0: + raise ValueError("Only keyword arguments is supported by Paddle") + op_desc = framework_pb2.OpDesc() + + for input_parameter in self.__op_proto__.inputs: + input_arguments = kwargs.get(input_parameter.name, []) + if is_str(input_arguments): + input_arguments = [input_arguments] + + if not input_parameter.duplicable and len(input_arguments) > 1: + raise ValueError("Input %s only accepts one input, but give %d" + % (input_parameter.name, len(input_arguments))) + + ipt = op_desc.inputs.add() + ipt.parameter = input_parameter.name + ipt.arguments.extend(input_arguments) + + for output_parameter in self.__op_proto__.outputs: + output_arguments = kwargs.get(output_parameter.name, []) + if is_str(output_arguments): + output_arguments = [output_arguments] + + if not output_parameter.duplicable and len(output_arguments) > 1: + raise ValueError( + "Output %s only accepts one output, but give %d" % + (output_parameter.name, len(output_arguments))) + + out = op_desc.outputs.add() + out.parameter = output_parameter.name + out.arguments.extend(output_arguments) + + # Types + op_desc.type = self.__op_proto__.type + + # Attrs + for attr in self.__op_proto__.attrs: + if attr.generated: + continue + user_defined_attr = kwargs.get(attr.name, None) + if user_defined_attr is not None: + new_attr = op_desc.attrs.add() + new_attr.name = attr.name + new_attr.type = attr.type + if attr.type == framework_pb2.INT: + new_attr.i = user_defined_attr + elif attr.type == framework_pb2.FLOAT: + new_attr.f = user_defined_attr + elif attr.type == framework_pb2.STRING: + new_attr.s = user_defined_attr + elif attr.type == framework_pb2.INTS: + new_attr.ints.extend(user_defined_attr) + elif attr.type == framework_pb2.FLOATS: + new_attr.floats.extend(user_defined_attr) + elif attr.type == framework_pb2.STRINGS: + new_attr.strings.extend(user_defined_attr) + else: + raise NotImplementedError("Not support attribute type " + + attr.type) + + return op_desc + + @staticmethod + def any_is_true(generator): + """ + Reduce a bool array to one. If any of them is True, then return True. 
+ """ + for flag in generator: + if flag: + return True + return False + + +class OpInfo(object): + def __init__(self, name, method, inputs, outputs, attrs): + self.name = name + self.method = method + self.inputs = inputs + self.outputs = outputs + self.attrs = attrs + + +def create_op_creation_method(op_proto): + """ + Generate op creation method for an OpProto + """ + method = OpDescCreationMethod(op_proto) + + def __impl__(*args, **kwargs): + opdesc = method(*args, **kwargs) + return core.Operator.create(opdesc.SerializeToString()) + + return OpInfo( + method=__impl__, + name=op_proto.type, + inputs=[var.name for var in op_proto.inputs], + outputs=[var.name for var in op_proto.outputs], + attrs=[attr.name for attr in op_proto.attrs]) + + +class OperatorFactory(object): + def __init__(self): + self.op_methods = dict() + for op_proto in get_all_op_protos(): + method = create_op_creation_method(op_proto) + self.op_methods[method.name] = method + + def __call__(self, *args, **kwargs): + if 'type' in kwargs: + if len(args) != 0: + raise ValueError("All Paddle argument should be key-word " + "argument except type") + t = kwargs.pop('type') + else: + if len(args) != 1: + raise ValueError("All Paddle argument should be key-word " + "argument except type") + t = args[0] + + return self.get_op_info(t).method(**kwargs) + + def types(self): + return self.op_methods.keys() + + def get_op_info(self, t): + if t not in self.op_methods: + raise ValueError("operator %s is not registered", t) + return self.op_methods.get(t) + + def get_op_input_names(self, type): + return self.get_op_info(type).inputs + + def get_op_output_names(self, type): + return self.get_op_info(type).outputs + + def get_op_attr_names(self, type): + return self.get_op_info(type).attrs + + +class __RecurrentOp__(object): + __proto__ = None + type = 'recurrent_op' + + def __init__(self): + # cache recurrent_op's proto + if self.__proto__ is None: + for op_proto in get_all_op_protos(): + if op_proto.type == self.type: + self.__proto__ = op_proto + + def __call__(self, *args, **kwargs): + if self.type not in args and 'type' not in kwargs: + kwargs['type'] = self.type + # create proto + create_method = OpDescCreationMethod(self.__proto__) + proto = create_method(*args, **kwargs) + # create rnnop + return core.RecurrentOp.create(proto.SerializeToString()) + + +Operator = OperatorFactory() # Default global factory +RecurrentOp = __RecurrentOp__() diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index d809917af1..b07a65f4d1 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -1 +1,29 @@ -add_python_test(test_framework test_protobuf.py test_scope.py) +py_test(test_net SRCS test_net.py) + +py_test(test_scope SRCS test_scope.py) + +py_test(test_tensor SRCS test_tensor.py) +py_test(test_mul_op SRCS test_mul_op.py) + +py_test(test_mean_op SRCS test_mean_op.py) + +py_test(test_protobuf SRCS test_protobuf.py) + +py_test(test_add_two_op SRCS test_add_two_op.py) +py_test(test_sigmoid_op SRCS test_sigmoid_op.py) +py_test(test_softmax_op SRCS test_softmax_op.py) +py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py) +py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py) + +py_test(gradient_checker SRCS gradient_checker.py) + +py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py) + +py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py) + +py_test(test_operator SRCS test_operator.py) 
+py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py) +py_test(test_uniform_random_op SRCS test_uniform_random_op.py) +py_test(test_recurrent_op SRCS test_recurrent_op.py) +py_test(test_sgd_op SRCS test_sgd_op.py) +py_test(test_gradient_checker SRCS test_gradient_checker.py) diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py new file mode 100644 index 0000000000..8b8e2f444b --- /dev/null +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -0,0 +1,283 @@ +import unittest + +import numpy +import itertools +import paddle.v2.framework.core as core +from paddle.v2.framework.op import Operator + +__all__ = ['get_numeric_gradient'] + + +def create_op(op_type): + # TODO need to set attrs + kwargs = dict() + for in_name in Operator.get_op_input_names(op_type): + kwargs[in_name] = in_name + for out_name in Operator.get_op_output_names(op_type): + kwargs[out_name] = out_name + + return Operator(op_type, **kwargs) + + +def grad_var_name(var_name): + return var_name + "@GRAD" + + +def get_numeric_gradient(op, + input_values, + output_name, + input_to_check, + delta=0.005, + local_scope=None): + """ + Get Numeric Gradient for an operator's input. + + :param op: C++ operator instance, could be an network + :param input_values: The input variables. Should be an dictionary, key is + variable name. Value is numpy array. + :param output_name: The final output variable name. + :param input_to_check: The input variable need to get gradient. + :param delta: The perturbation value for numeric gradient method. The + smaller delta is, the more accurate result will get. But if that delta is + too small, it could occur numerical stability problem. + :param local_scope: The local scope used for get_numeric_gradient. + :return: The gradient array in numpy format. + """ + if local_scope is None: + local_scope = core.Scope() + + # Create all input variable in local_scope + for var_name in input_values: + var = local_scope.new_var(var_name) + tensor = var.get_tensor() + tensor.set_dims(input_values[var_name].shape) + tensor.alloc_float(core.CPUPlace()) + tensor.set(input_values[var_name], core.CPUPlace()) + + # Create all output variable in local_scope + opts = op.outputs() + for key in opts: + for output in opts[key]: + if local_scope.find_var(output) is None: + local_scope.new_var(output).get_tensor() + op.infer_shape(local_scope) + + # allocate output memory + for key in opts: + for output in opts[key]: + local_scope.find_var(output).get_tensor().alloc_float(core.CPUPlace( + )) + + cpu_ctx = core.DeviceContext.create(core.CPUPlace()) + + def get_output(): + op.run(local_scope, cpu_ctx) + return numpy.array(local_scope.find_var(output_name).get_tensor()).sum() + + def product(dim): + return reduce(lambda a, b: a * b, dim, 1) + + # get the input tensor that we want to get it's numeric gradient. + tensor_to_check = local_scope.find_var(input_to_check).get_tensor() + tensor_size = product(tensor_to_check.get_dims()) + # prepare a numpy array to store the gradient. + gradient_flat = numpy.zeros(shape=(tensor_size, ), dtype='float32') + + # we only compute gradient of one element each time. + # we use a for loop to compute the gradient of every element. + for i in xrange(tensor_size): + # get one input element throw it's index i. + origin = tensor_to_check.get_float_element(i) + + # add delta to it, run op and then get the sum of the result tensor. 
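# (Illustrative sketch, not part of this patch.) The loop below is a plain
# central-difference estimate: perturb one input element by +delta and -delta,
# rerun the forward op each time, and approximate the derivative of the summed
# output. For an ordinary Python callable f and a flat numpy array x, the same
# idea reads:
#
#     orig = x[i]
#     x[i] = orig + delta; y_pos = f(x)
#     x[i] = orig - delta; y_neg = f(x)
#     x[i] = orig
#     grad[i] = (y_pos - y_neg) / (2.0 * delta)
#
# which matches `gradient_flat[i] = (y_pos - y_neg) / delta / 2` further down.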
+ x_pos = origin + delta + tensor_to_check.set_float_element(i, x_pos) + y_pos = get_output() + + # plus delta to this element, run op and get the sum of the result tensor. + x_neg = origin - delta + tensor_to_check.set_float_element(i, x_neg) + y_neg = get_output() + + # restore old value + tensor_to_check.set_float_element(i, origin) + + # compute the gradient of this element and store it into a numpy array. + gradient_flat[i] = (y_pos - y_neg) / delta / 2 + + # reshape the gradient result to the shape of the source tensor. + return gradient_flat.reshape(tensor_to_check.get_dims()) + + +class GradientChecker(unittest.TestCase): + def __get_gradient(self, forward_op, backward_op, input_value, grad_names, + place): + """Get the input gradients after running forward and backward operators + on the given places. + + :param forward_op: forward operator + :type forward_op: Operator + :param backward_op: backward operator + :type backward_op: Operator + :param input_value: input values. + :type input_value: dict{string:numpy.array} + :param grad_names: the names of returned input gradients. + :type input_value: a list of string + :param place: the device type. + :type place: CPUPlace or GPUPlace + :return: the input grdients of given grad_names. + :rtype: a list of numpy.array + """ + scope = core.Scope() + ctx = core.DeviceContext.create(place) + + inputs = forward_op.inputs() + in_names = [item for k in inputs for item in inputs[k]] + outputs = forward_op.outputs() + out_names = [item for k in outputs for item in outputs[k]] + + # create input var and set value + for name, value in input_value.iteritems(): + if name not in in_names: + raise ValueError(name + "does not exist in Op's inputs.") + var = scope.new_var(name).get_tensor() + var.set_dims(value.shape) + var.set(value, place) + + # run forward op + for out_name in out_names: + scope.new_var(out_name) + forward_op.infer_shape(scope) + forward_op.run(scope, ctx) + + # set output var's shape + # set output grad to ones + for name in out_names: + out_tensor = scope.find_var(name).get_tensor() + grad_tensor = scope.new_var(grad_var_name(name)).get_tensor() + grad_tensor.set_dims(out_tensor.shape()) + data = numpy.ones(out_tensor.shape(), dtype=numpy.float32) + grad_tensor.set(data, place) + + # run backward op + for name in backward_op.outputs(): + scope.new_var(name) + backward_op.infer_shape(scope) + backward_op.run(scope, ctx) + + outs = [ + numpy.array(scope.find_var(name).get_tensor()) + for name in grad_names + ] + return outs + + def compare_grad(self, forward_op, input_value): + """ Compare the input gradients between CPU and GPU for the given forward + operator. + + :param forward_op: forward operator + :type forward_op: Operator + :param input_value: input values. + :type input_value: dict{string:numpy.array} + :raises: AssertionError, there is different gradient value. 
+ """ + backward_op = core.Operator.backward(forward_op, set()) + # return if not compile with GPU or not implementing GPU kernel + if not (core.is_compile_gpu() and backward_op.support_gpu()): + return + + outputs = backward_op.outputs() + out_names = [item for k in outputs for item in outputs[k]] + cpu_grads = self.__get_gradient(forward_op, backward_op, input_value, + out_names, core.CPUPlace()) + gpu_grads = self.__get_gradient(forward_op, backward_op, input_value, + out_names, core.GPUPlace(0)) + + for c_grad, g_grad, name in itertools.izip(cpu_grads, gpu_grads, + out_names): + self.assertTrue( + numpy.allclose( + c_grad, g_grad, atol=1e-4), + "output name: " + name + " has diff") + + def __assert_is_close(self, numeric_grads, analytic_grads, names, + max_relative_error, msg_prefix): + """Use relative error for the comparison. + + :param numeric_grads: the numerical graidents. + :type numeric_grads: a list of numpy.array + :param analytic_grads: the analytical graidents. + :type analytic_grads: a list of numpy.array + :param name: the names of gradients, used to print for debug. + :type names: a list of string + :param msg_prefix: string info, used to print for debug. + :type msf_prefix: string + """ + for a, b, name in itertools.izip(numeric_grads, analytic_grads, names): + abs_a = numpy.abs(a) + # if abs_a is nearly zero, then use abs error for a, not relative + # error. + abs_a[abs_a < 1e-3] = 1 + + diff_mat = numpy.abs(a - b) / abs_a + max_diff = numpy.max(diff_mat) + + def err_msg(): + offset = numpy.argmax(diff_mat > max_relative_error) + return "%s Variable %s max gradient diff %f over limit %f, the first " \ + "error element is %d" % ( + msg_prefix, name, max_diff, max_relative_error, offset) + + self.assertLessEqual(max_diff, max_relative_error, err_msg()) + + def check_grad(self, + forward_op, + input_vars, + inputs_to_check, + output_name, + no_grad_set=None, + only_cpu=False, + max_relative_error=0.005): + """ + :param forward_op: used to create backward_op + :param input_vars: numpy value of input variable. The following + computation will use these variables. + :param inputs_to_check: inputs var names that should check gradient. + :param output_name: output name that used to + :param max_relative_error: The relative tolerance parameter. + :param no_grad_set: used when create backward ops + :param only_cpu: only compute and check gradient on cpu kernel. 
+ :return: + """ + if no_grad_set is None: + no_grad_set = set() + + no_tmp_out = forward_op.no_intermediate_outputs() + if len(no_tmp_out) != 1: + raise ValueError("non temp out_names should be 1") + + inputs = forward_op.inputs() + in_names = [item for k in inputs for item in inputs[k]] + for no_grad in no_grad_set: + if no_grad not in in_names: + raise ValueError("no_grad should be in in_names") + backward_op = core.Operator.backward(forward_op, no_grad_set) + + places = [core.CPUPlace()] + if not only_cpu and core.is_compile_gpu() and backward_op.support_gpu(): + places.append(core.GPUPlace(0)) + + # get numerical gradients + numeric_grads = [ + get_numeric_gradient(forward_op, input_vars, output_name, name) + for name in inputs_to_check + ] + + check_names = [grad_var_name(name) for name in inputs_to_check] + for place in places: + # get analytical gradients according to different device + analytic_grads = self.__get_gradient(forward_op, backward_op, + input_vars, check_names, place) + self.__assert_is_close(numeric_grads, analytic_grads, check_names, + max_relative_error, + "Gradient Check On %s" % str(place)) diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py new file mode 100644 index 0000000000..3bc05a0fec --- /dev/null +++ b/python/paddle/v2/framework/tests/op_test_util.py @@ -0,0 +1,72 @@ +import numpy +import paddle.v2.framework.core as core +from paddle.v2.framework.op import Operator + + +class OpTestMeta(type): + """ + Operator Test ClassMeta. + + It injects `test_all` method into user's OperatorTest class, to make Python + unittest module run that method. + + The `test_all` read what value is stored in `self`. It use self's values to + create and run a operator, and check whether that op is OK or not. + + See `test_add_two_op` for example usage. + """ + + def __new__(cls, name, bases, attrs): + obj = super(OpTestMeta, cls).__new__(cls, name, bases, attrs) + + def test_all(self): + scope = core.Scope() + kwargs = dict() + places = [core.CPUPlace()] + if core.is_compile_gpu(): + places.append(core.GPUPlace(0)) + + for place in places: + for in_name in Operator.get_op_input_names(self.type): + if hasattr(self, "inputs") and in_name in self.inputs: + kwargs[in_name] = in_name + var = scope.new_var(in_name).get_tensor() + arr = self.inputs[in_name] + var.set_dims(arr.shape) + var.set(arr, place) + else: + kwargs[in_name] = "@EMPTY@" + + for out_name in Operator.get_op_output_names(self.type): + if not hasattr(self, "outputs"): + raise ValueError( + "The test op must set self.outputs dict.") + if out_name not in self.outputs: + raise ValueError("The %s is not in self.outputs dict." 
% + (out_name)) + kwargs[out_name] = out_name + scope.new_var(out_name).get_tensor() + + for attr_name in Operator.get_op_attr_names(self.type): + if hasattr(self, "attrs") and attr_name in self.attrs: + kwargs[attr_name] = self.attrs[attr_name] + + op = Operator(self.type, **kwargs) + if isinstance(place, core.GPUPlace) and not op.support_gpu(): + return + + op.infer_shape(scope) + + ctx = core.DeviceContext.create(place) + op.run(scope, ctx) + + for out_name in Operator.get_op_output_names(self.type): + actual = numpy.array(scope.find_var(out_name).get_tensor()) + expect = self.outputs[out_name] + self.assertTrue( + numpy.allclose( + actual, expect, atol=1e-05), + "output name: " + out_name + "has diff") + + obj.test_all = test_all + return obj diff --git a/python/paddle/v2/framework/tests/test_add_two_op.py b/python/paddle/v2/framework/tests/test_add_two_op.py new file mode 100644 index 0000000000..0def484edd --- /dev/null +++ b/python/paddle/v2/framework/tests/test_add_two_op.py @@ -0,0 +1,23 @@ +import unittest + +import numpy +import paddle.v2.framework.core as core +from paddle.v2.framework.op import Operator + +from op_test_util import OpTestMeta + + +class TestAddOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "add_two" + self.inputs = { + 'X': numpy.random.random((102, 105)).astype("float32"), + 'Y': numpy.random.random((102, 105)).astype("float32") + } + self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py new file mode 100644 index 0000000000..d4277f2a42 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -0,0 +1,37 @@ +import unittest +import numpy +from op_test_util import OpTestMeta +from gradient_checker import GradientChecker, create_op + + +class TestCrossEntropy(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "onehot_cross_entropy" + batch_size = 30 + class_num = 10 + X = numpy.random.random((batch_size, class_num)).astype("float32") + label = 5 * numpy.ones(batch_size).astype("int32") + self.inputs = {'X': X, 'label': label} + Y = [] + for i in range(0, batch_size): + Y.append(-numpy.log(X[i][label[i]])) + self.outputs = {'Y': numpy.array(Y).astype("float32")} + + +class CrossEntropyGradOpTest(GradientChecker): + def test_check_grad(self): + op = create_op("onehot_cross_entropy") + batch_size = 30 + class_num = 10 + inputs = { + "X": numpy.random.uniform( + 0.1, 1.0, [batch_size, class_num]).astype("float32"), + "label": (class_num / 2) * numpy.ones(batch_size).astype("int32") + } + self.check_grad(op, inputs, set("X"), "Y") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/framework/tests/test_default_scope_funcs.py new file mode 100644 index 0000000000..495863c456 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py @@ -0,0 +1,33 @@ +from paddle.v2.framework.default_scope_funcs import * +import unittest + + +class TestDefaultScopeFuncs(unittest.TestCase): + def test_cur_scope(self): + self.assertIsNotNone(get_cur_scope()) + + def test_none_variable(self): + self.assertIsNone(find_var("test")) + + def test_create_var_get_var(self): + var_a = new_var("var_a") + self.assertIsNotNone(var_a) + self.assertIsNotNone(get_cur_scope().find_var('var_a')) + 
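# (Editor's comment, an assumption based on the scope semantics described in
# default_scope_funcs.py.) "var_a" was created in the parent scope, so it
# should stay visible through find_var() even after entering a nested local
# scope; the assertion following enter_local_scope() checks that lookup.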
enter_local_scope() + self.assertIsNotNone(get_cur_scope().find_var('var_a')) + leave_local_scope() + + def test_var_get_int(self): + def __new_scope__(): + i = new_var("var_i") + self.assertFalse(i.is_int()) + i.set_int(10) + self.assertTrue(i.is_int()) + self.assertEqual(10, i.get_int()) + + for _ in xrange(10): + scoped_function(__new_scope__) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py new file mode 100644 index 0000000000..e5c862605f --- /dev/null +++ b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py @@ -0,0 +1,16 @@ +import unittest +from op_test_util import OpTestMeta +import numpy + + +class TestFillZerosLikeOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "fill_zeros_like" + self.inputs = {'Src': numpy.random.random((219, 232)).astype("float32")} + self.outputs = {'Dst': numpy.zeros_like(self.inputs['Src'])} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_gaussian_random_op.py b/python/paddle/v2/framework/tests/test_gaussian_random_op.py new file mode 100644 index 0000000000..f95ed70b58 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py @@ -0,0 +1,36 @@ +import unittest +import paddle.v2.framework.core as core +from paddle.v2.framework.op import Operator +import numpy + + +class GaussianRandomTest(unittest.TestCase): + def test_cpu(self): + self.gaussian_random_test(place=core.CPUPlace()) + + def test_gpu(self): + if core.is_compile_gpu(): + self.gaussian_random_test(place=core.GPUPlace(0)) + + def gaussian_random_test(self, place): + scope = core.Scope() + scope.new_var("Out").get_tensor() + + op = Operator( + "gaussian_random", + Out="Out", + dims=[1000, 784], + mean=.0, + std=1., + seed=10) + + op.infer_shape(scope) + context = core.DeviceContext.create(place) + op.run(scope, context) + tensor = numpy.array(scope.find_var("Out").get_tensor()) + self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) + self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_gradient_checker.py b/python/paddle/v2/framework/tests/test_gradient_checker.py new file mode 100644 index 0000000000..e0b3151208 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_gradient_checker.py @@ -0,0 +1,43 @@ +import unittest +import numpy +from paddle.v2.framework.op import Operator +from gradient_checker import GradientChecker +from gradient_checker import get_numeric_gradient + + +class GetNumericGradientTest(unittest.TestCase): + def test_add_op(self): + add_op = Operator('add_two', X="X", Y="Y", Out="Z") + x = numpy.random.random((10, 1)).astype("float32") + y = numpy.random.random((10, 1)).astype("float32") + + arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X') + self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-4) + + def test_softmax_op(self): + def stable_softmax(x): + """Compute the softmax of vector x in a numerically stable way.""" + shiftx = x - numpy.max(x) + exps = numpy.exp(shiftx) + return exps / numpy.sum(exps) + + def label_softmax_grad(Y, dY): + dX = Y * 0.0 + for i in range(Y.shape[0]): + d = numpy.dot(Y[i, :], dY[i, :]) + dX[i, :] = Y[i, :] * (dY[i, :] - d) + return dX + + softmax_op = Operator("softmax", X="X", Y="Y") + + X = numpy.random.random((2, 2)).astype("float32") + Y = 
numpy.apply_along_axis(stable_softmax, 1, X) + dY = numpy.ones(Y.shape) + dX = label_softmax_grad(Y, dY) + + arr = get_numeric_gradient(softmax_op, {"X": X}, 'Y', 'X') + numpy.testing.assert_almost_equal(arr, dX, decimal=1e-2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_mean_op.py b/python/paddle/v2/framework/tests/test_mean_op.py new file mode 100644 index 0000000000..f32b3160d6 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_mean_op.py @@ -0,0 +1,24 @@ +import unittest +from op_test_util import OpTestMeta +from gradient_checker import GradientChecker, create_op +import numpy as np + + +class TestMeanOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "mean" + self.inputs = {'X': np.random.random((32, 784)).astype("float32")} + self.outputs = {'Out': np.mean(self.inputs['X'])} + + +class MeanGradOpTest(GradientChecker): + def test_normal(self): + op = create_op("mean") + inputs = {"X": np.random.random((10, 10)).astype("float32")} + self.check_grad(op, inputs, set("X"), "Out") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py new file mode 100644 index 0000000000..ee0d81a64e --- /dev/null +++ b/python/paddle/v2/framework/tests/test_mul_op.py @@ -0,0 +1,34 @@ +import unittest +import numpy as np +from gradient_checker import GradientChecker, create_op +from op_test_util import OpTestMeta + + +class TestMulOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "mul" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + +class MulGradOpTest(GradientChecker): + def test_mul(self): + op = create_op("mul") + inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + # mul op will enlarge the relative error + self.check_grad( + op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5) + + +# TODO(dzh,qijun) : mulgrad test case need transpose feature of blas library + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py new file mode 100644 index 0000000000..9339cf28da --- /dev/null +++ b/python/paddle/v2/framework/tests/test_net.py @@ -0,0 +1,39 @@ +import paddle.v2.framework.core as core +from paddle.v2.framework.op import Operator +import unittest + + +def fc(X, W, Y): + ret_v = core.Net.create() + + ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation")) + ret_v.append_op(Operator("sigmoid", X="pre_activation", Y=Y)) + ret_v.complete_add_op(True) + return ret_v + + +class TestNet(unittest.TestCase): + def test_net_all(self): + net = core.Net.create() + op1 = Operator("add_two", X="X", Y="Y", Out="Out") + net.append_op(op1) + + net2 = core.Net.create() + net2.append_op(fc(X="X", W="w", Y="fc.out")) + net2.complete_add_op(True) + net.append_op(net2) + net.complete_add_op(True) + + expected = ''' +Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}. + Op(add_two), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}. + Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. + Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. 
+ Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}. + Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Y[fc.out]}. +''' + self.assertEqual(expected, "\n" + str(net)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_operator.py b/python/paddle/v2/framework/tests/test_operator.py new file mode 100644 index 0000000000..1abc4eeb57 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_operator.py @@ -0,0 +1,204 @@ +import unittest +import paddle.v2.framework.op as op +import paddle.v2.framework.core as core +import paddle.v2.framework.proto.framework_pb2 as framework_pb2 + + +class TestGetAllProtos(unittest.TestCase): + def test_all(self): + all_protos = op.get_all_op_protos() + self.assertNotEqual(0, len(all_protos)) + + for each in all_protos: + self.assertTrue(each.IsInitialized()) + + +class TestOpDescCreationMethod(unittest.TestCase): + def test_plain_input_output(self): + op_proto = framework_pb2.OpProto() + op_proto.type = "test" + ipt = op_proto.inputs.add() + ipt.name = "X" + ipt.comment = "not matter" + + ipt = op_proto.inputs.add() + ipt.name = "Y" + ipt.comment = "not matter" + + opt = op_proto.outputs.add() + opt.name = "Z" + opt.comment = "not matter" + + op_proto.comment = "not matter" + + self.assertTrue(op_proto.IsInitialized()) + + method = op.OpDescCreationMethod(op_proto) + output = method(X="a", Y="b", Z="c") + expected = framework_pb2.OpDesc() + expected.type = "test" + ipt_0 = expected.inputs.add() + ipt_0.parameter = "X" + ipt_0.arguments.extend(["a"]) + ipt_1 = expected.inputs.add() + ipt_1.parameter = 'Y' + ipt_1.arguments.extend(['b']) + opt = expected.outputs.add() + opt.parameter = "Z" + opt.arguments.extend(["c"]) + + self.assertEqual(expected, output) + + def test_multiple_input_plain_output(self): + op_proto = framework_pb2.OpProto() + op_proto.type = "fc" + ipt = op_proto.inputs.add() + ipt.name = "X" + ipt.comment = "" + ipt.duplicable = True + + ipt = op_proto.inputs.add() + ipt.name = "W" + ipt.comment = "" + ipt.duplicable = True + + ipt = op_proto.inputs.add() + ipt.name = "b" + ipt.comment = "" + + out = op_proto.outputs.add() + out.name = "Y" + out.comment = "" + + op_proto.comment = "" + self.assertTrue(op_proto.IsInitialized()) + method = op.OpDescCreationMethod(op_proto) + + generated1 = method(X="x", W="w", b="b", Y="y") + expected1 = framework_pb2.OpDesc() + tmp = expected1.inputs.add() + tmp.parameter = "X" + tmp.arguments.extend(['x']) + + tmp = expected1.inputs.add() + tmp.parameter = 'W' + tmp.arguments.extend(['w']) + + tmp = expected1.inputs.add() + tmp.parameter = 'b' + tmp.arguments.extend(['b']) + + tmp = expected1.outputs.add() + tmp.parameter = 'Y' + tmp.arguments.extend(['y']) + expected1.type = 'fc' + self.assertEqual(expected1, generated1) + + generated2 = method( + X=['x1', 'x2', 'x3'], b='b', W=['w1', 'w2', 'w3'], Y='y') + expected2 = framework_pb2.OpDesc() + + tmp = expected2.inputs.add() + tmp.parameter = "X" + tmp.arguments.extend(['x1', 'x2', 'x3']) + + tmp = expected2.inputs.add() + tmp.parameter = 'W' + tmp.arguments.extend(['w1', 'w2', 'w3']) + + tmp = expected2.inputs.add() + tmp.parameter = 'b' + tmp.arguments.extend(['b']) + + tmp = expected2.outputs.add() + tmp.parameter = 'Y' + tmp.arguments.extend(['y']) + + expected2.type = 'fc' + self.assertEqual(expected2, generated2) + + def test_attrs(self): + op_proto = framework_pb2.OpProto() + op_proto.type = "test" + ipt = op_proto.inputs.add() + ipt.name = 'X' + ipt.comment = "" + + def __add_attr__(name, type): 
+ attr = op_proto.attrs.add() + attr.name = name + attr.comment = "" + attr.type = type + + __add_attr__("int_attr", framework_pb2.INT) + __add_attr__("float_attr", framework_pb2.FLOAT) + __add_attr__("string_attr", framework_pb2.STRING) + __add_attr__("ints_attr", framework_pb2.INTS) + __add_attr__("floats_attr", framework_pb2.FLOATS) + __add_attr__("strings_attr", framework_pb2.STRINGS) + + op_proto.comment = "" + self.assertTrue(op_proto.IsInitialized()) + + method = op.OpDescCreationMethod(op_proto) + + generated = method( + X="a", + int_attr=10, + float_attr=3.2, + string_attr="test_str", + ints_attr=[0, 1, 2, 3, 4], + floats_attr=[0.2, 3.2, 4.5], + strings_attr=["a", "b", "c"]) + + expected = framework_pb2.OpDesc() + expected.type = "test" + + ipt = expected.inputs.add() + ipt.parameter = "X" + ipt.arguments.extend(['a']) + + attr = expected.attrs.add() + attr.name = "int_attr" + attr.type = framework_pb2.INT + attr.i = 10 + + attr = expected.attrs.add() + attr.name = "float_attr" + attr.type = framework_pb2.FLOAT + attr.f = 3.2 + + attr = expected.attrs.add() + attr.name = "string_attr" + attr.type = framework_pb2.STRING + attr.s = "test_str" + + attr = expected.attrs.add() + attr.name = "ints_attr" + attr.type = framework_pb2.INTS + attr.ints.extend([0, 1, 2, 3, 4]) + + attr = expected.attrs.add() + attr.name = 'floats_attr' + attr.type = framework_pb2.FLOATS + attr.floats.extend([0.2, 3.2, 4.5]) + + attr = expected.attrs.add() + attr.name = 'strings_attr' + attr.type = framework_pb2.STRINGS + attr.strings.extend(['a', 'b', 'c']) + + self.assertEqual(expected, generated) + + +class TestOpCreations(unittest.TestCase): + def test_all(self): + add_op = op.Operator("add_two", X="a", Y="b", Out="z") + self.assertIsNotNone(add_op) + # Invoke C++ DebugString() + self.assertEqual('Op(add_two), inputs:{X[a], Y[b]}, outputs:{Out[z]}.', + str(add_op)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/framework/tests/test_protobuf.py index b8702477e6..848a396b3b 100644 --- a/python/paddle/v2/framework/tests/test_protobuf.py +++ b/python/paddle/v2/framework/tests/test_protobuf.py @@ -1,13 +1,10 @@ -import paddle.v2.framework.proto.op_proto_pb2 -import paddle.v2.framework.proto.attr_type_pb2 +import paddle.v2.framework.proto.framework_pb2 as framework_pb2 import unittest class TestFrameworkProto(unittest.TestCase): def test_all(self): - op_proto_lib = paddle.v2.framework.proto.op_proto_pb2 - attr_type_lib = paddle.v2.framework.proto.attr_type_pb2 - op_proto = op_proto_lib.OpProto() + op_proto = framework_pb2.OpProto() ipt0 = op_proto.inputs.add() ipt0.name = "a" ipt0.comment = "the input of cosine op" @@ -21,7 +18,7 @@ class TestFrameworkProto(unittest.TestCase): attr = op_proto.attrs.add() attr.name = "scale" attr.comment = "scale of cosine op" - attr.type = attr_type_lib.FLOAT + attr.type = framework_pb2.FLOAT op_proto.type = "cos" self.assertTrue(op_proto.IsInitialized()) diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py new file mode 100644 index 0000000000..d6000ab9f9 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_recurrent_op.py @@ -0,0 +1,168 @@ +import logging +import paddle.v2.framework.core as core +import unittest +import numpy as np +from paddle.v2.framework.op import Operator, RecurrentOp + + +def py_sigmoid(x): + return 1. / (1. 
+ np.exp(-x)) + + +class PySimpleRNN(object): + ''' + A simple implementation of RNN based on numpy, to futhur test RecurrentOp's alogorithm + ''' + + def __init__(self, input_dim=30, batch_size=50, weight_dim=15, sent_len=11): + self.x = np.random.normal(size=(sent_len, batch_size, input_dim)) + self.W = np.random.normal(size=(input_dim, input_dim)) + self.U = np.random.normal(size=(input_dim, input_dim)) + self.h_boot = np.random.normal(size=(batch_size, input_dim)) + + # memories + self.mems = [ + np.zeros(shape=(batch_size, input_dim)) for i in range(sent_len) + ] + + def forward(self): + xs = self.segment_inputs() + for step_id in range(self.x.shape[0]): + self.step(step_id, xs[step_id]) + return self.concat_outputs() + + def segment_inputs(self): + return [self.x[i] for i in range(self.x.shape[0])] + + def concat_outputs(self): + return np.array(self.mems) + + def step(self, step_id, x): + ''' + run a step + ''' + mem = self.mems[step_id] + if step_id > 0: + pre_mem = self.mems[step_id - 1] + else: + pre_mem = self.h_boot + xW = np.matmul(x, self.W) + hU = np.matmul(mem, self.U) + + sum = xW + hU + self.mems[step_id] = py_sigmoid(sum) + + +class PySimpleRNNTest(unittest.TestCase): + def setUp(self): + self.rnn = PySimpleRNN() + + def test_forward(self): + output = self.rnn.forward() + print 'output', output + + +def create_tensor(scope, name, shape, np_data): + tensor = scope.new_var(name).get_tensor() + tensor.set_dims(shape) + tensor.set(np_data, core.CPUPlace()) + return tensor + + +class TestRecurrentOp(unittest.TestCase): + ''' + Test RNNOp + + equation: + h_t = \sigma (W x_t + U h_{t-1}) + weights: + - W + - U + vars: + - x + memories: + - h + outputs: + - h + ''' + + input_dim = 30 + batch_size = 50 + weight_dim = 15 + sent_len = 11 + + def setUp(self): + self.py_rnn = PySimpleRNN(self.input_dim, self.batch_size, + self.weight_dim, self.sent_len) + + def forward(self): + self.scope = core.Scope() + self.create_global_variables() + self.create_rnn_op() + self.create_step_net() + ctx = core.DeviceContext.create(core.CPUPlace()) + self.rnnop.infer_shape(self.scope) + self.rnnop.run(self.scope, ctx) + return np.array(self.scope.find_var("h").get_tensor()) + + def create_global_variables(self): + # create inlink + x_np_data = self.py_rnn.x + create_tensor(self.scope, "x", + [self.sent_len, self.batch_size, self.input_dim], + x_np_data) + W_np_data = self.py_rnn.W + create_tensor(self.scope, "W", [self.input_dim, self.input_dim], + W_np_data) + + U_np_data = self.py_rnn.U + create_tensor(self.scope, "U", [self.input_dim, self.input_dim], + U_np_data) + + h_boot_np_data = self.py_rnn.h_boot + create_tensor(self.scope, "h_boot", [self.batch_size, self.input_dim], + h_boot_np_data) + self.scope.new_var("step_scopes") + self.scope.new_var("h@alias") + self.scope.new_var("h") + + def create_rnn_op(self): + # create RNNOp + self.rnnop = RecurrentOp( + # inputs + inlinks=["x"], + boot_memories=["h_boot"], + step_net="stepnet", + # outputs + outlinks=["h"], + step_scopes="step_scopes", + # attributes + inlink_alias=["x@alias"], + outlink_alias=["h@alias"], + pre_memories=["h@pre"], + memories=["h@alias"]) + + def create_step_net(self): + stepnet = core.Net.create() + x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx") + h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") + sum_op = Operator("add_two", X="Wx", Y="Uh", Out="sum") + sig_op = Operator("sigmoid", X="sum", Y="h@alias") + + for op in [x_fc_op, h_fc_op, sum_op, sig_op]: + stepnet.append_op(op) + stepnet.complete_add_op(True) 
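# (Illustrative note, not part of this patch.) The four ops appended above
# implement h = sigmoid(x * W + h_pre * U) for one time step, i.e. the same
# update that PySimpleRNN.step() computes in numpy, so the C++ RecurrentOp
# output can be compared against the pure-Python reference below.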
+ self.rnnop.set_stepnet(stepnet) + + def test_forward(self): + print 'test recurrent op forward' + pd_output = self.forward() + py_output = self.py_rnn.forward() + print 'pd_output', pd_output + print + print 'py_output', py_output + self.assertEqual(pd_output.shape, py_output.shape) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py new file mode 100644 index 0000000000..45d569da29 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py @@ -0,0 +1,30 @@ +import unittest +import numpy as np +from op_test_util import OpTestMeta +from gradient_checker import GradientChecker, create_op + + +class TestRowwiseAddOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "rowwise_add" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'b': np.random.random(84).astype("float32") + } + self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])} + + +class RowwiseAddGradOpTest(GradientChecker): + def test_rowwise_add(self): + op = create_op("rowwise_add") + inputs = { + "X": np.random.uniform(0.1, 1, [5, 10]).astype("float32"), + "b": np.random.uniform(0.1, 1, [10]).astype("float32") + } + self.check_grad(op, inputs, set(["X", "b"]), "Out") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/framework/tests/test_scope.py index f0ee45cfc7..1ce9454067 100644 --- a/python/paddle/v2/framework/tests/test_scope.py +++ b/python/paddle/v2/framework/tests/test_scope.py @@ -5,29 +5,29 @@ import unittest class TestScope(unittest.TestCase): def test_create_destroy(self): paddle_c = paddle.v2.framework.core - scope = paddle_c.Scope(None) + scope = paddle_c.Scope() self.assertIsNotNone(scope) - scope_with_parent = paddle_c.Scope(scope) + scope_with_parent = scope.new_scope() self.assertIsNotNone(scope_with_parent) def test_none_variable(self): paddle_c = paddle.v2.framework.core - scope = paddle_c.Scope(None) - self.assertIsNone(scope.get_var("test")) + scope = paddle_c.Scope() + self.assertIsNone(scope.find_var("test")) def test_create_var_get_var(self): paddle_c = paddle.v2.framework.core - scope = paddle_c.Scope(None) - var_a = scope.create_var("var_a") + scope = paddle_c.Scope() + var_a = scope.new_var("var_a") self.assertIsNotNone(var_a) - self.assertIsNotNone(scope.get_var('var_a')) - scope2 = paddle_c.Scope(scope) - self.assertIsNotNone(scope2.get_var('var_a')) + self.assertIsNotNone(scope.find_var('var_a')) + scope2 = scope.new_scope() + self.assertIsNotNone(scope2.find_var('var_a')) def test_var_get_int(self): paddle_c = paddle.v2.framework.core - scope = paddle_c.Scope(None) - var = scope.create_var("test_int") + scope = paddle_c.Scope() + var = scope.new_var("test_int") var.set_int(10) self.assertTrue(var.is_int()) self.assertEqual(10, var.get_int()) diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py new file mode 100644 index 0000000000..e5f9ef865e --- /dev/null +++ b/python/paddle/v2/framework/tests/test_sgd_op.py @@ -0,0 +1,21 @@ +import unittest +import numpy +from op_test_util import OpTestMeta + + +class TestSGD(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "sgd" + w = numpy.random.random((102, 105)).astype("float32") + g = numpy.random.random((102, 105)).astype("float32") + lr = 0.1 + + self.inputs = {'param': w, 'grad': 
g} + self.attrs = {'learning_rate': lr} + self.outputs = {'param_out': w - lr * g} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_sigmoid_op.py b/python/paddle/v2/framework/tests/test_sigmoid_op.py new file mode 100644 index 0000000000..273c2e5ab1 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py @@ -0,0 +1,28 @@ +import unittest +import numpy as np +from op_test_util import OpTestMeta +from gradient_checker import GradientChecker, create_op + + +class TestSigmoidOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "sigmoid" + self.inputs = {'X': np.random.random((15, 31)).astype("float32")} + self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))} + + +class TestSigmoidGradOp(GradientChecker): + def test_grad(self): + op = create_op("sigmoid") + inputs = {"X": np.random.uniform(0.1, 1, [11, 17]).astype("float32")} + # compare gpu and cpu results for backward op. + # this test will be skiped if only compiling CPU version. + self.compare_grad(op, inputs) + # check gradients + self.check_grad(op, inputs, set("X"), "Y", max_relative_error=0.007) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/framework/tests/test_softmax_op.py new file mode 100644 index 0000000000..e670d93653 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_softmax_op.py @@ -0,0 +1,35 @@ +import unittest + +import numpy as np + +from gradient_checker import GradientChecker, create_op +from op_test_util import OpTestMeta + + +def stable_softmax(x): + """Compute the softmax of vector x in a numerically stable way.""" + shiftx = x - np.max(x) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + +class TestSoftmaxOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "softmax" + self.inputs = {'X': np.random.random((32, 100)).astype("float32")} + self.outputs = { + 'Y': np.apply_along_axis(stable_softmax, 1, self.inputs['X']) + } + + +class SoftmaxGradOpTest(GradientChecker): + def test_softmax(self): + op = create_op("softmax") + inputs = {"X": np.random.uniform(0.1, 1, [10, 10]).astype("float32")} + self.check_grad(op, inputs, set("X"), "Y") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/framework/tests/test_tensor.py new file mode 100644 index 0000000000..1af39818a3 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_tensor.py @@ -0,0 +1,48 @@ +import paddle.v2.framework.core as core +import unittest +import numpy + + +class TestScope(unittest.TestCase): + def test_int_tensor(self): + scope = core.Scope() + var = scope.new_var("test_tensor") + place = core.CPUPlace() + + tensor = var.get_tensor() + + tensor.set_dims([1000, 784]) + tensor.alloc_int(place) + tensor_array = numpy.array(tensor) + self.assertEqual((1000, 784), tensor_array.shape) + tensor_array[3, 9] = 1 + tensor_array[19, 11] = 2 + tensor.set(tensor_array, place) + + tensor_array_2 = numpy.array(tensor) + self.assertEqual(1.0, tensor_array_2[3, 9]) + self.assertEqual(2.0, tensor_array_2[19, 11]) + + def test_float_tensor(self): + scope = core.Scope() + var = scope.new_var("test_tensor") + place = core.CPUPlace() + + tensor = var.get_tensor() + + tensor.set_dims([1000, 784]) + tensor.alloc_float(place) + + tensor_array = numpy.array(tensor) + self.assertEqual((1000, 784), tensor_array.shape) + tensor_array[3, 9] = 1.0 + tensor_array[19, 
11] = 2.0 + tensor.set(tensor_array, place) + + tensor_array_2 = numpy.array(tensor) + self.assertAlmostEqual(1.0, tensor_array_2[3, 9]) + self.assertAlmostEqual(2.0, tensor_array_2[19, 11]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_uniform_random_op.py b/python/paddle/v2/framework/tests/test_uniform_random_op.py new file mode 100644 index 0000000000..c3d2bb44da --- /dev/null +++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py @@ -0,0 +1,35 @@ +import unittest +from paddle.v2.framework.op import Operator +import paddle.v2.framework.core as core +import numpy + + +class UniformRandomTest(unittest.TestCase): + def test_uniform_random_cpu(self): + self.uniform_random_test(place=core.CPUPlace()) + + def test_uniform_random_gpu(self): + if core.is_compile_gpu(): + self.uniform_random_test(place=core.GPUPlace(0)) + + def uniform_random_test(self, place): + scope = core.Scope() + scope.new_var("X").get_tensor() + + op = Operator( + "uniform_random", + Out="X", + dims=[1000, 784], + min=-5.0, + max=10.0, + seed=10) + + op.infer_shape(scope) + ctx = core.DeviceContext.create(place) + op.run(scope, ctx) + tensor = numpy.array(scope.find_var("X").get_tensor()) + self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 34b7308601..4dcc3ab57e 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -1,9 +1,7 @@ import numpy -import py_paddle.swig_paddle as api import collections import topology import minibatch -from data_feeder import DataFeeder __all__ = ['infer', 'Inference'] @@ -28,6 +26,7 @@ class Inference(object): """ def __init__(self, output_layer, parameters): + import py_paddle.swig_paddle as api topo = topology.Topology(output_layer) gm = api.GradientMachine.createFromConfigProto( topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE]) @@ -36,10 +35,18 @@ class Inference(object): name = param.getName() assert isinstance(val, api.Vector) val.copyFromNumpyArray(parameters.get(name).flatten()) + # the setValueUpdated function is called in randomize, zeroMem, + # load function in paddle/parameter/Parameter.cpp. But in the + # inference mode, the setValueUpdated is never called, it will + # cause the parameter will not be dispatched + # in MultiGradientMachine for multi-GPU. So setValueUpdated is + # called here, but it's better to call this function in one place. 
+ param.setValueUpdated() self.__gradient_machine__ = gm self.__data_types__ = topo.data_type() def iter_infer(self, input, feeding=None): + from data_feeder import DataFeeder feeder = DataFeeder(self.__data_types__, feeding) batch_size = len(input) diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 4ade1c6f32..6a2bb8d337 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -324,6 +324,3 @@ def parse_network(output_layers, extra_layers=None): def get_layer(name): return config_base.__layer_map__.get(name) - - -cp.begin_parse() diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 70f9e43c96..fc718f031e 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -1,8 +1,15 @@ import ctypes import os -path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so") -lib = ctypes.cdll.LoadLibrary(path) +__lib__ = None + + +def get_c_lib(): + global __lib__ + if __lib__ is None: + path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so") + __lib__ = ctypes.cdll.LoadLibrary(path) + return __lib__ class client(object): @@ -10,29 +17,53 @@ class client(object): client is a client to the master server. """ - def __init__(self, addr, buf_size): - self.c = lib.paddle_new_master_client(addr, buf_size) + def __init__(self, etcd_endpoints, timeout_sec, buf_size=0): + self.c = get_c_lib().paddle_new_etcd_master_client( + etcd_endpoints, timeout_sec, buf_size) + + def request_save_model(self, trainer_id, block_ms): + """request to save model + + Conventionally the 0-th trainer will save model. But in + distributed training, any trainer could be killed. This + function asks the master server if the trainer should proceed + with saving model. + + :param trainer_id: trainer id. + :param block_ms: number of millisecond that other save model + will be blocked if this save model request succeeded. - def close(self): - lib.paddle_release_master_client(self.c) + Returns: + int: 1 if the save the model request is approved, 0 if + does the request is rejected because other trainer is + saving the model, -1 if error happened. + + """ + return get_c_lib().paddle_request_save_model(self.c, trainer_id, + block_ms) + + def release(self): + get_c_lib().paddle_release_master_client(self.c) self.c = None def set_dataset(self, paths): holder_type = ctypes.c_char_p * len(paths) holder = holder_type() - print paths for idx, path in enumerate(paths): c_ptr = ctypes.c_char_p(path) holder[idx] = c_ptr - lib.paddle_set_dataset(self.c, holder, len(paths)) + get_c_lib().paddle_set_dataset(self.c, holder, len(paths)) - # return format: (record, errno) - # errno = 0: ok - # < 0: error def next_record(self): + """gets next record for training + + Returns: + string: the record. + int: error code, 0 if successful, < 0 otherwise. + """ p = ctypes.c_char_p() ret = ctypes.pointer(p) - size = lib.paddle_next_record(self.c, ret) + size = get_c_lib().paddle_next_record(self.c, ret) if size < 0: # Error return None, size @@ -43,5 +74,8 @@ class client(object): record = ret.contents.value[:size] # Memory created from C should be freed. 
- lib.mem_free(ret.contents) + get_c_lib().mem_free(ret.contents) return record, 0 + + def paddle_start_get_records(self, pass_id): + get_c_lib().paddle_start_get_records(self.c, pass_id) diff --git a/python/paddle/v2/model.py b/python/paddle/v2/model.py new file mode 100644 index 0000000000..20c3282098 --- /dev/null +++ b/python/paddle/v2/model.py @@ -0,0 +1,73 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import errno +import uuid + +import paddle.v2.master + +__all__ = ["save_model", "load_model"] + +trainer_id = str(uuid.uuid4()) + + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + + +def save_model(parameters, path): + need_request = "KUBERNETES_SERVICE_HOST" in os.environ.keys() + + if need_request: + # TODO(helin): figure out how MPI trains, since MPI only save + # model when trainer_id == "0", we can consolidate the logic + # here. + + # TODO(helin): change this environment variable name from + # MASTER_IP to ETCD_IP + etcd_name = "MASTER_IP" + if etcd_name not in os.environ.keys(): + raise Exception('not find ' + etcd_name + + ' in environment variable.') + + etcd_ip = os.environ.get(etcd_name) + client = master.client("http://" + etcd_ip + ":2379", 5, 0) + r = client.request_save_model(trainer_id, 5000) + if r == 0: + # do not need to save + return + elif r < 0: + # error + return + else: + # save model + path = os.path.join(path, trainer_id) + path = os.path.join(path, "model.tar") + + mkdir_p(path) + + with open(path, 'wb') as f: + parameters.to_tar(f) + + +def load_model(parameters, path): + with open(path, 'rb') as f: + parameters.from_tar(f) diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index 390c22ee55..29f0945eb4 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -1,15 +1,26 @@ -import py_paddle.swig_paddle as swig_api - -import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils -import paddle.trainer_config_helpers.optimizers as v1_optimizers +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Optimizers(update equation) for SGD method. -TODO(zhihong) : create new optimizer with proto config, add new optimizer here - TODO(yuyang18): Complete comments. 
""" +import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils +import paddle.trainer_config_helpers.optimizers as v1_optimizers +from paddle.proto.OptimizerConfig_pb2 import OptimizerConfig + __all__ = [ 'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta', 'RMSProp', 'ModelAverage', 'L2Regularization' @@ -18,6 +29,7 @@ __all__ = [ class Optimizer(object): def __init__(self, **kwargs): + import py_paddle.swig_paddle as swig_api if 'batch_size' in kwargs: del kwargs['batch_size'] # not important for python library. @@ -36,23 +48,27 @@ class Optimizer(object): For each optimizer(SGD, Adam), GradientMachine should enable different buffers. """ + import py_paddle.swig_paddle as swig_api tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__) assert isinstance(tmp, swig_api.ParameterOptimizer) return tmp.getParameterTypes() def __create_local_updater__(self): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__) def __create_remote_updater__(self, pass_num, use_sparse_updater): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createRemoteUpdater( self.__opt_conf__, pass_num, use_sparse_updater) - def __create_new_remote_updater__(self, pserver_spec): + def __create_new_remote_updater__(self, pserver_spec, use_etcd): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createNewRemoteUpdater( - self.__opt_conf__, pserver_spec) + self.__opt_conf__, pserver_spec, use_etcd) def create_updater(self, is_local, num_passes, use_sparse_updater, - pserver_spec): + pserver_spec, use_etcd): """ create proper parameter_updater by configuration. :param is_local: create local or remote parameter updater @@ -67,7 +83,8 @@ class Optimizer(object): gradient_machine.prefetch(in_args) parameter_updater.getParametersRemote() - :param pserver_spec: pserver location, eg: localhost:3000 + :param pserver_spec: pserver location, eg: localhost:3000, if use etcd, + pserver_spec should be the etcd endpoints, eg: http://localhost:2379 :return: parameter_updater """ if is_local: @@ -78,7 +95,7 @@ class Optimizer(object): num_passes, use_sparse_updater) else: parameter_updater = self.__create_new_remote_updater__( - pserver_spec) + pserver_spec, use_etcd) return parameter_updater @@ -268,6 +285,7 @@ ModelAverage = v1_optimizers.ModelAverage L2Regularization = v1_optimizers.L2Regularization if __name__ == '__main__': + import py_paddle.swig_paddle as swig_api swig_api.initPaddle('--use_gpu=false') for opt in [ Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(), diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py index bbaf8bfa97..b8af5abaea 100644 --- a/python/paddle/v2/parameters.py +++ b/python/paddle/v2/parameters.py @@ -1,5 +1,18 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import numpy as np -import py_paddle.swig_paddle as api from paddle.proto.ParameterConfig_pb2 import ParameterConfig import paddle.trainer.config_parser as cp import struct @@ -114,16 +127,8 @@ class Parameters(object): """ return iter(self.__param_conf__) - def __getitem__(self, key): - """ - Get parameter by parameter name. It uses Python dict syntax. - - :note: It will always copy the parameter from C++ side. - :param key: Parameter name - :type key: basestring - :return: parameter value - :rtype: np.ndarray - """ + def __getter_inner(self, key, param_type): + import py_paddle.swig_paddle as api shape = self.get_shape(key) if len(self.__gradient_machines__) == 0: @@ -138,7 +143,7 @@ class Parameters(object): each_gradient_machine, key) # for simplify implementation now, we always copy from C++ assert isinstance(param, api.Parameter) - val = param.getBuf(api.PARAMETER_VALUE) + val = param.getBuf(param_type) assert isinstance(val, api.Vector) val = val.copyToNumpyArray() return val @@ -146,6 +151,19 @@ class Parameters(object): raise RuntimeError("Unexpected branch") + def __getitem__(self, key): + """ + Get parameter by parameter name. It uses Python dict syntax. + + :note: It will always copy the parameter from C++ side. + :param key: Parameter name + :type key: basestring + :return: parameter value + :rtype: np.ndarray + """ + import py_paddle.swig_paddle as api + return self.__getter_inner(key, api.PARAMETER_VALUE) + def get_shape(self, key): """ get shape of the parameter. @@ -202,6 +220,19 @@ class Parameters(object): """ return self.__getitem__(key=parameter_name) + def get_grad(self, key): + """ + Get grandient by parameter name. + + :note: It will always copy the parameter from C++ side. + :param key: parameter name + :type key: basestring + :return: The grandient matrix. + :rtype: np.ndarray + """ + import py_paddle.swig_paddle as api + return self.__getter_inner(key, api.PARAMETER_GRADIENT) + def set(self, parameter_name, value): """ Set parameter by parameter name & matrix. @@ -223,7 +254,7 @@ class Parameters(object): :type gradient_machine: api.GradientMachine :return: """ - + import py_paddle.swig_paddle as api if not isinstance(gradient_machine, api.GradientMachine): raise ValueError("gradient_machine should be api.GradientMachine") @@ -250,7 +281,13 @@ class Parameters(object): size = reduce(lambda a, b: a * b, param.shape) f.write(struct.pack("IIQ", 0, 4, size)) param = param.astype(np.float32) - f.write(param.tostring()) + s = param.tostring() + wrote_size = 0 + buf = buffer(s, wrote_size, 65535) + while buf: # f.write crashes with big data blog. + f.write(buf) + wrote_size += 65535 + buf = buffer(s, wrote_size, 65535) def deserialize(self, name, f): """ @@ -359,6 +396,7 @@ def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr): :return: :rtype: api.Parameter """ + import py_paddle.swig_paddle as api param = __get_parameter_in_gradient_machine__(gradient_machine, name) vec = param.getBuf(api.PARAMETER_VALUE) assert isinstance(vec, api.Vector) diff --git a/python/paddle/v2/plot/tests/CMakeLists.txt b/python/paddle/v2/plot/tests/CMakeLists.txt index da5cd76488..4b6c1c8096 100644 --- a/python/paddle/v2/plot/tests/CMakeLists.txt +++ b/python/paddle/v2/plot/tests/CMakeLists.txt @@ -1,5 +1,5 @@ if (NOT APPLE) # The Mac OS X backend will not be able to function correctly if Python is # not installed as a framework. 
- add_python_test(test_ploter test_ploter.py) + py_test(test_ploter SRCS test_ploter.py) endif() diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 61b5cc134f..97e844b92c 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Creator package contains some simple reader creator, which could be used in user -program. +Creator package contains some simple reader creator, which could +be used in user program. """ -__all__ = ['np_array', 'text_file', "recordio"] +__all__ = ['np_array', 'text_file', "cloud_reader"] def np_array(x): @@ -57,9 +57,9 @@ def text_file(path): return reader -def recordio_local(paths, buf_size=100): +def recordio(paths, buf_size=100): """ - Creates a data reader from given RecordIO file paths separated by ",", + Creates a data reader from given RecordIO file paths separated by ",", glob pattern is supported. :path: path of recordio files. :returns: data reader of recordio files. @@ -67,49 +67,59 @@ def recordio_local(paths, buf_size=100): import recordio as rec import paddle.v2.reader.decorator as dec + import cPickle as pickle def reader(): - a = ','.join(paths) - f = rec.reader(a) + if isinstance(paths, basestring): + path = paths + else: + path = ",".join(paths) + f = rec.reader(path) while True: r = f.read() if r is None: break - yield r + yield pickle.loads(r) f.close() return dec.buffered(reader, buf_size) -def recordio(paths, buf_size=100): +pass_num = 0 + + +def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64): """ - Creates a data reader that outputs record one one by one - from given local or cloud recordio path. + Create a data reader that yield a record one by one from + the paths: :path: path of recordio files. + :etcd_endpoints: the endpoints for etcd cluster :returns: data reader of recordio files. + + .. 
code-block:: python + from paddle.v2.reader.creator import cloud_reader + etcd_endpoints = "http://127.0.0.1:2379" + trainer.train.( + reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints), + ) """ import os - import paddle.v2.master.client as cloud - - if "KUBERNETES_SERVICE_HOST" not in os.environ.keys(): - return recordio_local(paths) - - host_name = "MASTER_SERVICE_HOST" - if host_name not in os.environ.keys(): - raise Exception('not find ' + host_name + ' in environ.') - - addr = os.environ(host) + import cPickle as pickle + import paddle.v2.master as master + c = master.client(etcd_endpoints, timeout_sec, buf_size) + c.set_dataset(paths) def reader(): - c = cloud(addr, buf_size) - c.set_dataset(paths) + global pass_num + c.paddle_start_get_records(pass_num) + pass_num += 1 while True: - r, err = client.next_record() - if err < 0: + r, e = c.next_record() + if not r: + if e != -2: + print "get record error: ", e break - yield r - - c.close() + yield pickle.loads(r) return reader diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt index 6a1d337b23..107d5912e1 100644 --- a/python/paddle/v2/reader/tests/CMakeLists.txt +++ b/python/paddle/v2/reader/tests/CMakeLists.txt @@ -1 +1,2 @@ -add_python_test(reader_tests creator_test.py decorator_test.py) +py_test(creator_test SRCS creator_test.py) +py_test(decorator_test SRCS decorator_test.py) diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py index b42d273ecf..cf190aa664 100644 --- a/python/paddle/v2/reader/tests/creator_test.py +++ b/python/paddle/v2/reader/tests/creator_test.py @@ -35,12 +35,25 @@ class TestTextFile(unittest.TestCase): class TestRecordIO(unittest.TestCase): - def test_recordio(self): - path = os.path.join( - os.path.dirname(__file__), "test_recordio_creator.dat") - reader = paddle.v2.reader.creator.recordio([path]) - for idx, r in enumerate(reader()): - self.assertSequenceEqual(r, str(idx)) + def do_test(self, path): + reader = paddle.v2.reader.creator.recordio(path) + idx = 0 + for e in reader(): + if idx == 0: + self.assertEqual(e, (1, 2, 3)) + elif idx == 1: + self.assertEqual(e, (4, 5, 6)) + idx += 1 + self.assertEqual(idx, 2) + + def test_recordIO(self): + self.do_test( + os.path.join( + os.path.dirname(__file__), "test_reader_recordio.dat")) + self.do_test([ + os.path.join( + os.path.dirname(__file__), "test_reader_recordio.dat") + ]) if __name__ == '__main__': diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat new file mode 100644 index 0000000000..a99a35bb82 Binary files /dev/null and b/python/paddle/v2/reader/tests/test_reader_recordio.dat differ diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt index 058f22befd..b779155959 100644 --- a/python/paddle/v2/tests/CMakeLists.txt +++ b/python/paddle/v2/tests/CMakeLists.txt @@ -1,2 +1,7 @@ -add_python_test(test_v2_api test_data_feeder.py test_op.py test_parameters.py -test_layer.py test_rnn_layer.py test_topology.py test_image.py) +py_test(test_op SRCS test_op.py) +py_test(test_image SRCS test_image.py) +py_test(test_layer SRCS test_layer.py) +py_test(test_topology SRCS test_topology.py) +py_test(test_rnn_layer SRCS test_rnn_layer.py) +py_test(test_parameters SRCS test_parameters.py) +py_test(test_data_feeder SRCS test_data_feeder.py) diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 
96c6c4b89a..0654a30104 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -2,12 +2,6 @@ Module Trainer """ import collections -import gzip -import os - -import py_paddle.swig_paddle as api - -from data_feeder import DataFeeder from topology import Topology from . import event as v2_event from . import optimizer as v2_optimizer @@ -33,16 +27,24 @@ class SGD(object): SGD Trainer combines data reader, network topolopy and update_equation together to train/test a neural network. - :param update_equation: The optimizer object. - :type update_equation: paddle.v2.optimizer.Optimizer :param cost: Target cost that neural network should be optimized. :type cost: paddle.v2.config_base.Layer :param parameters: The parameters dictionary. :type parameters: paddle.v2.parameters.Parameters + :param update_equation: The optimizer object. + :type update_equation: paddle.v2.optimizer.Optimizer :param extra_layers: Some layers in the neural network graph are not in the path of cost layer. - :param pserver_spec: pserver location, eg: localhost:3000 :type extra_layers: paddle.v2.config_base.Layer + :param is_local: Whether trainning locally + :type is_local: bool + :param pserver_spec: comma string for pserver location, + eg:127.10.0.10:3000,127.10.0.11:3000, + and this parameter is only used for fault + tolerant mode cluster training. + :type pserver_spec: string + :param use_etcd: Whether using etcd pserver. + :param use_etcd: bool """ def __init__(self, @@ -51,7 +53,8 @@ class SGD(object): update_equation, extra_layers=None, is_local=True, - pserver_spec=None): + pserver_spec=None, + use_etcd=True): if not isinstance(parameters, v2_parameters.Parameters): raise TypeError('parameters should be parameters') @@ -59,6 +62,7 @@ class SGD(object): if not isinstance(update_equation, v2_optimizer.Optimizer): raise TypeError("update equation parameter must be " "paddle.v2.optimizer.Optimizer") + import py_paddle.swig_paddle as api topology = Topology(cost, extra_layers=extra_layers) self.__optimizer__ = update_equation self.__topology__ = topology @@ -66,6 +70,7 @@ class SGD(object): self.__topology_in_proto__ = topology.proto() self.__is_local__ = is_local self.__pserver_spec__ = pserver_spec + self.__use_etcd__ = use_etcd self.__use_sparse_updater__ = self.__topology__.use_sparse_updater() # # In local mode, disable sparse_remote_update. 
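
Taken together with the cloud_reader creator above, the new is_local, pserver_spec and use_etcd arguments let the trainer drive a fault-tolerant cluster job. A hypothetical end-to-end sketch; cost and parameters stand for a previously built network, and the dataset glob and etcd endpoint are placeholders, not values from this patch:

import paddle.v2 as paddle
from paddle.v2.reader.creator import cloud_reader

etcd_endpoints = "http://127.0.0.1:2379"

trainer = paddle.trainer.SGD(
    cost=cost,                    # cost layer of an already-defined topology
    parameters=parameters,        # a paddle.v2.parameters.Parameters instance
    update_equation=paddle.optimizer.Momentum(momentum=0.9),
    is_local=False,
    pserver_spec=etcd_endpoints,  # etcd endpoints, because use_etcd=True
    use_etcd=True)

trainer.train(
    reader=paddle.batch(
        cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
        batch_size=32),
    num_passes=30)
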
@@ -124,13 +129,15 @@ class SGD(object): :type feeding: dict|list :return: """ + import py_paddle.swig_paddle as api + from data_feeder import DataFeeder if event_handler is None: event_handler = default_event_handler __check_train_args__(**locals()) self.__parameter_updater__ = self.__optimizer__.create_updater( self.__is_local__, num_passes, self.__use_sparse_updater__, - self.__pserver_spec__) + self.__pserver_spec__, self.__use_etcd__) self.__parameter_updater__.init(self.__gradient_machine__) self.__gradient_machine__.start() @@ -162,14 +169,14 @@ class SGD(object): self.__parameter_updater__.update(each_param) cost_sum = out_args.sum() cost = cost_sum / len(data_batch) - self.__parameter_updater__.finishBatch(cost) - batch_evaluator.finish() event_handler( v2_event.EndIteration( pass_id=pass_id, batch_id=batch_id, cost=cost, evaluator=batch_evaluator)) + self.__parameter_updater__.finishBatch(cost) + batch_evaluator.finish() self.__parameter_updater__.finishPass() pass_evaluator.finish() @@ -187,6 +194,8 @@ class SGD(object): :type feeding: dict :return: """ + import py_paddle.swig_paddle as api + from data_feeder import DataFeeder feeder = DataFeeder(self.__data_types__, feeding) evaluator = self.__gradient_machine__.makeEvaluator() out_args = api.Arguments.createArguments(0) diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000000..e19453c25d --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,9 @@ +requests==2.9.2 +numpy>=1.12 +protobuf==3.1 +recordio>=0.1.0 +matplotlib +rarfile +scipy>=0.19.0 +Pillow +nltk>=3.2.2 diff --git a/python/setup.py.in b/python/setup.py.in index 271ee6e552..87b3823e52 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -1,4 +1,7 @@ -from setuptools import setup +from setuptools import setup, Distribution +class BinaryDistribution(Distribution): + def has_ext_modules(foo): + return True packages=['paddle', 'paddle.proto', @@ -11,31 +14,46 @@ packages=['paddle', 'paddle.v2.master', 'paddle.v2.plot', 'paddle.v2.framework', - 'paddle.v2.framework.proto'] + 'paddle.v2.framework.proto', + 'py_paddle'] -setup_requires=["requests", - "numpy", - "protobuf==3.1", - "recordio", - "matplotlib", - "rarfile", - "scipy>=0.19.0"] +with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: + setup_requires = f.read().splitlines() if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: setup_requires+=["opencv-python"] -setup(name='paddle', +# the prefix is sys.prefix which should always be usr +paddle_bin_dir = 'opt/paddle/bin' +paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle_usage', + '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', + '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', + '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main', + '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] + +paddle_rt_lib_dir = 'lib' +paddle_rt_libs = ['${WARPCTC_LIBRARIES}'] +if '${MKL_SHARED_LIBS}'!= '': + paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';') + +setup(name='paddlepaddle', version='${PADDLE_VERSION}', description='Parallel Distributed Deep Learning', install_requires=setup_requires, packages=packages, - package_data={'paddle.v2.master': ['libpaddle_master.so'], - 'paddle.v2.framework': ['core.so'] + package_data={ + 'paddle.v2.master': ['libpaddle_master.so'], + 'paddle.v2.framework': ['core.so'], + 'py_paddle':['*.py','_swig_paddle.so'] }, package_dir={ '': '${CMAKE_CURRENT_SOURCE_DIR}', # The paddle.v2.framework.proto will be generated while compiling. 
# So that package points to other directory. - 'paddle.v2.framework.proto': '${PROJ_BINARY_ROOT}/paddle/framework' + 'paddle.v2.framework.proto': '${PADDLE_BINARY_DIR}/paddle/framework', + 'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle' }, + scripts=paddle_bins, + distclass=BinaryDistribution, + data_files=[(paddle_rt_lib_dir, paddle_rt_libs)] )
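
The setup.py.in rewrite above ships prebuilt shared objects (py_paddle's _swig_paddle.so, core.so, the runtime libraries) inside the paddlepaddle package. Overriding Distribution.has_ext_modules() to return True is what forces setuptools to produce a platform-specific wheel even though no ext_modules are declared. A minimal, self-contained sketch of that trick with placeholder names:

from setuptools import setup, Distribution

class BinaryDistribution(Distribution):
    def has_ext_modules(self):
        # Pretend we have extension modules so the wheel is tagged as
        # platform-specific instead of "pure Python".
        return True

setup(
    name='example-binary-pkg',                     # placeholder project name
    version='0.0.1',
    packages=['example_pkg'],
    package_data={'example_pkg': ['_native.so']},  # prebuilt shared object
    distclass=BinaryDistribution)
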